// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
27 #include "MDBalancer.h"
29 #include "SnapClient.h"
32 #include "msg/Messenger.h"
34 #include "osdc/Objecter.h"
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
43 #include "messages/MMDSSlaveRequest.h"
45 #include "messages/MLock.h"
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
66 #include "common/config.h"
68 #define dout_context g_ceph_context
69 #define dout_subsys ceph_subsys_mds
71 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
74 class ServerContext
: public MDSInternalContextBase
{
77 MDSRank
*get_mds() override
83 explicit ServerContext(Server
*s
) : server(s
) {
84 assert(server
!= NULL
);
88 class ServerLogContext
: public MDSLogContextBase
{
91 MDSRank
*get_mds() override
97 void pre_finish(int r
) override
{
99 mdr
->mark_event("journal_committed: ");
102 explicit ServerLogContext(Server
*s
) : server(s
) {
103 assert(server
!= NULL
);
105 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
106 assert(server
!= NULL
);
110 void Server::create_logger()
112 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
113 plb
.add_u64_counter(l_mdss_handle_client_request
,"handle_client_request",
114 "Client requests", "hcr");
115 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
116 "Slave requests", "hsr");
117 plb
.add_u64_counter(l_mdss_handle_client_session
, "handle_client_session",
118 "Client session messages", "hcs");
119 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request", "Client requests dispatched");
120 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request", "Server requests dispatched");
121 plb
.add_u64_counter(l_mdss_req_lookuphash
, "req_lookuphash",
122 "Request type lookup hash of inode");
123 plb
.add_u64_counter(l_mdss_req_lookupino
, "req_lookupino",
124 "Request type lookup inode");
125 plb
.add_u64_counter(l_mdss_req_lookupparent
, "req_lookupparent",
126 "Request type lookup parent");
127 plb
.add_u64_counter(l_mdss_req_lookupname
, "req_lookupname",
128 "Request type lookup name");
129 plb
.add_u64_counter(l_mdss_req_lookup
, "req_lookup",
130 "Request type lookup");
131 plb
.add_u64_counter(l_mdss_req_lookupsnap
, "req_lookupsnap",
132 "Request type lookup snapshot");
133 plb
.add_u64_counter(l_mdss_req_getattr
, "req_getattr",
134 "Request type get attribute");
135 plb
.add_u64_counter(l_mdss_req_setattr
, "req_setattr",
136 "Request type set attribute");
137 plb
.add_u64_counter(l_mdss_req_setlayout
, "req_setlayout",
138 "Request type set file layout");
139 plb
.add_u64_counter(l_mdss_req_setdirlayout
, "req_setdirlayout",
140 "Request type set directory layout");
141 plb
.add_u64_counter(l_mdss_req_setxattr
, "req_setxattr",
142 "Request type set extended attribute");
143 plb
.add_u64_counter(l_mdss_req_rmxattr
, "req_rmxattr",
144 "Request type remove extended attribute");
145 plb
.add_u64_counter(l_mdss_req_readdir
, "req_readdir",
146 "Request type read directory");
147 plb
.add_u64_counter(l_mdss_req_setfilelock
, "req_setfilelock",
148 "Request type set file lock");
149 plb
.add_u64_counter(l_mdss_req_getfilelock
, "req_getfilelock",
150 "Request type get file lock");
151 plb
.add_u64_counter(l_mdss_req_create
, "req_create",
152 "Request type create");
153 plb
.add_u64_counter(l_mdss_req_open
, "req_open",
154 "Request type open");
155 plb
.add_u64_counter(l_mdss_req_mknod
, "req_mknod",
156 "Request type make node");
157 plb
.add_u64_counter(l_mdss_req_link
, "req_link",
158 "Request type link");
159 plb
.add_u64_counter(l_mdss_req_unlink
, "req_unlink",
160 "Request type unlink");
161 plb
.add_u64_counter(l_mdss_req_rmdir
, "req_rmdir",
162 "Request type remove directory");
163 plb
.add_u64_counter(l_mdss_req_rename
, "req_rename",
164 "Request type rename");
165 plb
.add_u64_counter(l_mdss_req_mkdir
, "req_mkdir",
166 "Request type make directory");
167 plb
.add_u64_counter(l_mdss_req_symlink
, "req_symlink",
168 "Request type symbolic link");
169 plb
.add_u64_counter(l_mdss_req_lssnap
, "req_lssnap",
170 "Request type list snapshot");
171 plb
.add_u64_counter(l_mdss_req_mksnap
, "req_mksnap",
172 "Request type make snapshot");
173 plb
.add_u64_counter(l_mdss_req_rmsnap
, "req_rmsnap",
174 "Request type remove snapshot");
175 plb
.add_u64_counter(l_mdss_req_renamesnap
, "req_renamesnap",
176 "Request type rename snapshot");
177 logger
= plb
.create_perf_counters();
178 g_ceph_context
->get_perfcounters_collection()->add(logger
);
181 Server::Server(MDSRank
*m
) :
183 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
186 reconnect_done(NULL
),
187 failed_reconnects(0),
188 terminating_sessions(false)
193 /* This function DOES put the passed message before returning*/
194 void Server::dispatch(Message
*m
)
196 switch (m
->get_type()) {
197 case CEPH_MSG_CLIENT_RECONNECT
:
198 handle_client_reconnect(static_cast<MClientReconnect
*>(m
));
203 if (!mds
->is_active() &&
204 !(mds
->is_stopping() && m
->get_source().is_mds())) {
205 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&&
206 (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
)) {
207 MClientRequest
*req
= static_cast<MClientRequest
*>(m
);
208 Session
*session
= get_session(req
);
209 if (!session
|| session
->is_closed()) {
210 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
214 bool queue_replay
= false;
215 if (req
->is_replay()) {
216 dout(3) << "queuing replayed op" << dendl
;
218 } else if (req
->get_retry_attempt()) {
219 // process completed request in clientreplay stage. The completed request
220 // might have created new file/directorie. This guarantees MDS sends a reply
221 // to client before other request modifies the new file/directorie.
222 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
223 dout(3) << "queuing completed op" << dendl
;
226 // this request was created before the cap reconnect message, drop any embedded
228 req
->releases
.clear();
231 req
->mark_queued_for_replay();
232 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
237 bool wait_for_active
= true;
238 if (m
->get_type() == MSG_MDS_SLAVE_REQUEST
) {
239 // handle_slave_request() will wait if necessary
240 wait_for_active
= false;
241 } else if (mds
->is_clientreplay()) {
242 // session open requests need to be handled during replay,
243 // close requests need to be delayed
244 if ((m
->get_type() == CEPH_MSG_CLIENT_SESSION
&&
245 (static_cast<MClientSession
*>(m
))->get_op() != CEPH_SESSION_REQUEST_CLOSE
)) {
246 wait_for_active
= false;
247 } else if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
) {
248 MClientRequest
*req
= static_cast<MClientRequest
*>(m
);
249 if (req
->is_queued_for_replay()) {
250 wait_for_active
= false;
254 if (wait_for_active
) {
255 dout(3) << "not active yet, waiting" << dendl
;
256 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
261 switch (m
->get_type()) {
262 case CEPH_MSG_CLIENT_SESSION
:
263 handle_client_session(static_cast<MClientSession
*>(m
));
265 case CEPH_MSG_CLIENT_REQUEST
:
266 handle_client_request(static_cast<MClientRequest
*>(m
));
268 case MSG_MDS_SLAVE_REQUEST
:
269 handle_slave_request(static_cast<MMDSSlaveRequest
*>(m
));
272 derr
<< "server unknown message " << m
->get_type() << dendl
;
273 assert(0 == "server unknown message");
279 // ----------------------------------------------------------
280 // SESSION management
282 class C_MDS_session_finish
: public ServerLogContext
{
287 interval_set
<inodeno_t
> inos
;
291 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
292 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
293 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
>& i
, version_t iv
, Context
*fin_
= NULL
) :
294 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(i
), inotablev(iv
), fin(fin_
) { }
295 void finish(int r
) override
{
297 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
);
304 Session
*Server::get_session(Message
*m
)
306 Session
*session
= static_cast<Session
*>(m
->get_connection()->get_priv());
308 dout(20) << "get_session have " << session
<< " " << session
->info
.inst
309 << " state " << session
->get_state_name() << dendl
;
310 session
->put(); // not carry ref
312 dout(20) << "get_session dne for " << m
->get_source_inst() << dendl
;
317 /* This function DOES put the passed message before returning*/
318 void Server::handle_client_session(MClientSession
*m
)
321 Session
*session
= get_session(m
);
323 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
324 assert(m
->get_source().is_client()); // should _not_ come from an mds!
327 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
333 logger
->inc(l_mdss_handle_client_session
);
336 switch (m
->get_op()) {
337 case CEPH_SESSION_REQUEST_OPEN
:
338 if (session
->is_opening() ||
339 session
->is_open() ||
340 session
->is_stale() ||
341 session
->is_killing()) {
342 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
346 assert(session
->is_closed() ||
347 session
->is_closing());
349 session
->set_client_metadata(m
->client_meta
);
350 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN "
351 << session
->info
.client_metadata
.size() << " metadata entries:" << dendl
;
352 for (map
<string
, string
>::iterator i
= session
->info
.client_metadata
.begin();
353 i
!= session
->info
.client_metadata
.end(); ++i
) {
354 dout(20) << " " << i
->first
<< ": " << i
->second
<< dendl
;
357 // Special case for the 'root' metadata path; validate that the claimed
358 // root is actually within the caps of the session
359 if (session
->info
.client_metadata
.count("root")) {
360 const auto claimed_root
= session
->info
.client_metadata
.at("root");
361 // claimed_root has a leading "/" which we strip before passing
363 if (claimed_root
.empty() || claimed_root
[0] != '/' ||
364 !session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
365 derr
<< __func__
<< " forbidden path claimed as mount root: "
366 << claimed_root
<< " by " << m
->get_source() << dendl
;
367 // Tell the client we're rejecting their open
368 mds
->send_message_client(new MClientSession(CEPH_SESSION_REJECT
), session
);
369 mds
->clog
->warn() << "client session with invalid root '" <<
370 claimed_root
<< "' denied (" << session
->info
.inst
<< ")";
372 // Drop out; don't record this session in SessionMap or journal it.
377 if (session
->is_closed())
378 mds
->sessionmap
.add_session(session
);
380 pv
= mds
->sessionmap
.mark_projected(session
);
381 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
382 mds
->sessionmap
.touch_session(session
);
383 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, m
->client_meta
),
384 new C_MDS_session_finish(this, session
, sseq
, true, pv
));
388 case CEPH_SESSION_REQUEST_RENEWCAPS
:
389 if (session
->is_open() ||
390 session
->is_stale()) {
391 mds
->sessionmap
.touch_session(session
);
392 if (session
->is_stale()) {
393 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
394 mds
->locker
->resume_stale_caps(session
);
395 mds
->sessionmap
.touch_session(session
);
397 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS
, m
->get_seq()));
399 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
403 case CEPH_SESSION_REQUEST_CLOSE
:
405 if (session
->is_closed() ||
406 session
->is_closing() ||
407 session
->is_killing()) {
408 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
412 if (session
->is_importing()) {
413 dout(10) << "ignoring close req on importing session" << dendl
;
417 assert(session
->is_open() ||
418 session
->is_stale() ||
419 session
->is_opening());
420 if (m
->get_seq() < session
->get_push_seq()) {
421 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
422 << ", dropping" << dendl
;
426 // We are getting a seq that is higher than expected.
427 // Handle the same as any other seqn error.
429 if (m
->get_seq() != session
->get_push_seq()) {
430 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
431 << ", BUGGY!" << dendl
;
432 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
433 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
437 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
441 case CEPH_SESSION_FLUSHMSG_ACK
:
442 finish_flush_session(session
, m
->get_seq());
451 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
453 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
454 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
456 if (!session
->is_open() ||
457 !session
->connection
.get() ||
458 !session
->connection
->has_feature(CEPH_FEATURE_EXPORT_PEER
))
460 version_t seq
= session
->wait_for_flush(gather
.new_sub());
461 mds
->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG
, seq
), session
);
465 void Server::finish_flush_session(Session
*session
, version_t seq
)
467 list
<MDSInternalContextBase
*> finished
;
468 session
->finish_flush(seq
, finished
);
469 mds
->queue_waiters(finished
);
472 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
473 interval_set
<inodeno_t
>& inos
, version_t piv
)
475 dout(10) << "_session_logged " << session
->info
.inst
<< " state_seq " << state_seq
<< " " << (open
? "open":"close")
476 << " " << pv
<< dendl
;
479 assert(session
->is_closing() || session
->is_killing() ||
480 session
->is_opening()); // re-open closing session
481 session
->info
.prealloc_inos
.subtract(inos
);
482 mds
->inotable
->apply_release_ids(inos
);
483 assert(mds
->inotable
->get_version() == piv
);
486 mds
->sessionmap
.mark_dirty(session
);
489 if (session
->get_state_seq() != state_seq
) {
490 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
491 << ", noop" << dendl
;
492 // close must have been canceled (by an import?), or any number of other things..
494 assert(session
->is_opening());
495 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
496 mds
->sessionmap
.touch_session(session
);
497 assert(session
->connection
!= NULL
);
498 session
->connection
->send_message(new MClientSession(CEPH_SESSION_OPEN
));
499 if (mdcache
->is_readonly())
500 session
->connection
->send_message(new MClientSession(CEPH_SESSION_FORCE_RO
));
501 } else if (session
->is_closing() ||
502 session
->is_killing()) {
503 // kill any lingering capabilities, leases, requests
504 while (!session
->caps
.empty()) {
505 Capability
*cap
= session
->caps
.front();
506 CInode
*in
= cap
->get_inode();
507 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
508 mds
->locker
->remove_client_cap(in
, session
->info
.inst
.name
.num());
510 while (!session
->leases
.empty()) {
511 ClientLease
*r
= session
->leases
.front();
512 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
513 dout(20) << " killing client lease of " << *dn
<< dendl
;
514 dn
->remove_client_lease(r
, mds
->locker
);
516 if (client_reconnect_gather
.count(session
->info
.get_client())) {
517 dout(20) << " removing client from reconnect set" << dendl
;
518 client_reconnect_gather
.erase(session
->info
.get_client());
520 if (client_reconnect_gather
.empty()) {
521 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
522 reconnect_gather_finish();
526 if (session
->is_closing()) {
527 // mark con disposable. if there is a fault, we will get a
528 // reset and clean it up. if the client hasn't received the
529 // CLOSE message yet, they will reconnect and get an
530 // ms_handle_remote_reset() and realize they had in fact closed.
531 // do this *before* sending the message to avoid a possible
533 if (session
->connection
!= NULL
) {
534 // Conditional because terminate_sessions will indiscrimately
535 // put sessions in CLOSING whether they ever had a conn or not.
536 session
->connection
->mark_disposable();
540 mds
->send_message_client(new MClientSession(CEPH_SESSION_CLOSE
), session
);
541 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
543 mds
->sessionmap
.remove_session(session
);
544 } else if (session
->is_killing()) {
545 // destroy session, close connection
546 if (session
->connection
!= NULL
) {
547 session
->connection
->mark_down();
549 mds
->sessionmap
.remove_session(session
);
559 * Inject sessions from some source other than actual connections.
562 * - sessions inferred from journal replay
563 * - sessions learned from other MDSs during rejoin
564 * - sessions learned from other MDSs during dir/caps migration
565 * - sessions learned from other MDSs during a cross-MDS rename
567 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
568 map
<client_t
,uint64_t>& sseqmap
)
570 version_t pv
= mds
->sessionmap
.get_projected();
572 dout(10) << "prepare_force_open_sessions " << pv
573 << " on " << cm
.size() << " clients"
575 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
577 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
578 pv
= mds
->sessionmap
.mark_projected(session
);
579 if (session
->is_closed() ||
580 session
->is_closing() ||
581 session
->is_killing())
582 sseqmap
[p
->first
] = mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
584 assert(session
->is_open() ||
585 session
->is_opening() ||
586 session
->is_stale());
587 session
->inc_importing();
592 void Server::finish_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
593 map
<client_t
,uint64_t>& sseqmap
,
597 * FIXME: need to carefully consider the race conditions between a
598 * client trying to close a session and an MDS doing an import
599 * trying to force open a session...
601 dout(10) << "finish_force_open_sessions on " << cm
.size() << " clients,"
602 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
605 int sessions_inserted
= 0;
606 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
609 Session
*session
= mds
->sessionmap
.get_session(p
->second
.name
);
612 if (sseqmap
.count(p
->first
)) {
613 uint64_t sseq
= sseqmap
[p
->first
];
614 if (session
->get_state_seq() != sseq
) {
615 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
617 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
618 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
619 mds
->sessionmap
.touch_session(session
);
620 mds
->send_message_client(new MClientSession(CEPH_SESSION_OPEN
), session
);
621 if (mdcache
->is_readonly())
622 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
625 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
626 assert(session
->is_open() || session
->is_stale());
630 session
->dec_importing();
633 mds
->sessionmap
.mark_dirty(session
);
636 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
639 class C_MDS_TerminatedSessions
: public ServerContext
{
640 void finish(int r
) override
{
641 server
->terminating_sessions
= false;
644 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
647 void Server::terminate_sessions()
649 dout(2) << "terminate_sessions" << dendl
;
651 terminating_sessions
= true;
653 // kill them off. clients will retry etc.
654 set
<Session
*> sessions
;
655 mds
->sessionmap
.get_client_session_set(sessions
);
656 for (set
<Session
*>::const_iterator p
= sessions
.begin();
659 Session
*session
= *p
;
660 if (session
->is_closing() ||
661 session
->is_killing() ||
662 session
->is_closed())
664 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
667 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
671 void Server::find_idle_sessions()
673 dout(10) << "find_idle_sessions. laggy until " << mds
->get_laggy_until() << dendl
;
676 // (caps go stale, lease die)
677 utime_t now
= ceph_clock_now();
678 utime_t cutoff
= now
;
679 cutoff
-= g_conf
->mds_session_timeout
;
681 Session
*session
= mds
->sessionmap
.get_oldest_session(Session::STATE_OPEN
);
683 dout(20) << "laggiest active session is " << session
->info
.inst
<< dendl
;
684 if (session
->last_cap_renew
>= cutoff
) {
685 dout(20) << "laggiest active session is " << session
->info
.inst
<< " and sufficiently new ("
686 << session
->last_cap_renew
<< ")" << dendl
;
690 dout(10) << "new stale session " << session
->info
.inst
<< " last " << session
->last_cap_renew
<< dendl
;
691 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
692 mds
->locker
->revoke_stale_caps(session
);
693 mds
->locker
->remove_stale_leases(session
);
694 mds
->send_message_client(new MClientSession(CEPH_SESSION_STALE
, session
->get_push_seq()), session
);
695 finish_flush_session(session
, session
->get_push_seq());
700 cutoff
-= g_conf
->mds_session_autoclose
;
702 // don't kick clients if we've been laggy
703 if (mds
->get_laggy_until() > cutoff
) {
704 dout(10) << " laggy_until " << mds
->get_laggy_until() << " > cutoff " << cutoff
705 << ", not kicking any clients to be safe" << dendl
;
709 if (mds
->sessionmap
.get_sessions().size() == 1 &&
710 mds
->mdsmap
->get_num_in_mds() == 1) {
711 dout(20) << "not evicting a slow client, because there is only one"
717 Session
*session
= mds
->sessionmap
.get_oldest_session(Session::STATE_STALE
);
720 if (session
->is_importing()) {
721 dout(10) << "stopping at importing session " << session
->info
.inst
<< dendl
;
724 assert(session
->is_stale());
725 if (session
->last_cap_renew
>= cutoff
) {
726 dout(20) << "oldest stale session is " << session
->info
.inst
<< " and sufficiently new ("
727 << session
->last_cap_renew
<< ")" << dendl
;
732 age
-= session
->last_cap_renew
;
733 mds
->clog
->info() << "closing stale session " << session
->info
.inst
735 dout(10) << "autoclosing stale session " << session
->info
.inst
<< " last " << session
->last_cap_renew
<< dendl
;
736 kill_session(session
, NULL
);
741 * XXX bump in the interface here, not using an MDSInternalContextBase here
742 * because all the callers right now happen to use a SaferCond
744 void Server::kill_session(Session
*session
, Context
*on_safe
)
746 if ((session
->is_opening() ||
747 session
->is_open() ||
748 session
->is_stale()) &&
749 !session
->is_importing()) {
750 dout(10) << "kill_session " << session
<< dendl
;
751 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
753 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
754 assert(session
->is_closing() ||
755 session
->is_closed() ||
756 session
->is_killing() ||
757 session
->is_importing());
759 on_safe
->complete(0);
764 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
766 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
767 version_t pv
= mds
->sessionmap
.mark_projected(session
);
770 // release alloc and pending-alloc inos for this session
771 // and wipe out session state, in case the session close aborts for some reason
772 interval_set
<inodeno_t
> both
;
773 both
.insert(session
->info
.prealloc_inos
);
774 both
.insert(session
->pending_prealloc_inos
);
776 mds
->inotable
->project_release_ids(both
);
777 piv
= mds
->inotable
->get_projected_version();
781 mdlog
->start_submit_entry(new ESession(session
->info
.inst
, false, pv
, both
, piv
),
782 new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
, on_safe
));
785 // clean up requests, too
786 elist
<MDRequestImpl
*>::iterator p
=
787 session
->requests
.begin(member_offset(MDRequestImpl
,
788 item_session_request
));
790 MDRequestRef mdr
= mdcache
->request_get((*p
)->reqid
);
792 mdcache
->request_kill(mdr
);
795 finish_flush_session(session
, session
->get_push_seq());
798 void Server::reconnect_clients(MDSInternalContext
*reconnect_done_
)
800 reconnect_done
= reconnect_done_
;
801 mds
->sessionmap
.get_client_set(client_reconnect_gather
);
803 if (client_reconnect_gather
.empty()) {
804 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
805 reconnect_gather_finish();
809 // clients will get the mdsmap and discover we're reconnecting via the monitor.
811 reconnect_start
= ceph_clock_now();
812 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
813 mds
->sessionmap
.dump();
816 /* This function DOES put the passed message before returning*/
817 void Server::handle_client_reconnect(MClientReconnect
*m
)
819 dout(7) << "handle_client_reconnect " << m
->get_source() << dendl
;
820 client_t from
= m
->get_source().num();
821 Session
*session
= get_session(m
);
824 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
825 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
826 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
830 utime_t delay
= ceph_clock_now();
831 delay
-= reconnect_start
;
832 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
835 if (!mds
->is_reconnect()) {
836 // XXX maybe in the future we can do better than this?
837 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
838 mds
->clog
->info() << "denied reconnect attempt (mds is "
839 << ceph_mds_state_name(mds
->get_state())
840 << ") from " << m
->get_source_inst()
841 << " after " << delay
<< " (allowed interval " << g_conf
->mds_reconnect_timeout
<< ")";
843 } else if (session
->is_closed()) {
844 dout(1) << " session is closed, ignoring reconnect, sending close" << dendl
;
845 mds
->clog
->info() << "denied reconnect attempt (mds is "
846 << ceph_mds_state_name(mds
->get_state())
847 << ") from " << m
->get_source_inst() << " (session is closed)";
849 } else if (mdcache
->is_readonly()) {
850 dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl
;
851 mds
->clog
->info() << "denied reconnect attempt (mds is read-only)";
856 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE
));
861 // notify client of success with an OPEN
862 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN
));
863 session
->last_cap_renew
= ceph_clock_now();
864 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
867 for (vector
<ceph_mds_snaprealm_reconnect
>::iterator p
= m
->realms
.begin();
868 p
!= m
->realms
.end();
870 CInode
*in
= mdcache
->get_inode(inodeno_t(p
->ino
));
871 if (in
&& in
->state_test(CInode::STATE_PURGING
))
874 assert(in
->snaprealm
);
875 if (in
->snaprealm
->have_past_parents_open()) {
876 dout(15) << "open snaprealm (w/ past parents) on " << *in
<< dendl
;
877 mdcache
->finish_snaprealm_reconnect(from
, in
->snaprealm
, snapid_t(p
->seq
));
879 dout(15) << "open snaprealm (w/o past parents) on " << *in
<< dendl
;
880 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
883 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p
->ino
)
884 << " seq " << p
->seq
<< dendl
;
885 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
890 for (map
<inodeno_t
, cap_reconnect_t
>::iterator p
= m
->caps
.begin();
893 // make sure our last_cap_id is MAX over all issued caps
894 if (p
->second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
895 mdcache
->last_cap_id
= p
->second
.capinfo
.cap_id
;
897 CInode
*in
= mdcache
->get_inode(p
->first
);
898 if (in
&& in
->state_test(CInode::STATE_PURGING
))
900 if (in
&& in
->is_auth()) {
901 // we recovered it, and it's ours. take note.
902 dout(15) << "open cap realm " << inodeno_t(p
->second
.capinfo
.snaprealm
)
903 << " on " << *in
<< dendl
;
904 in
->reconnect_cap(from
, p
->second
, session
);
905 mdcache
->add_reconnected_cap(from
, p
->first
, p
->second
);
906 recover_filelocks(in
, p
->second
.flockbl
, m
->get_orig_source().num());
910 if (in
&& !in
->is_auth()) {
912 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
913 // add to cap export list.
914 p
->second
.path
.clear(); // we don't need path
915 mdcache
->rejoin_export_caps(p
->first
, from
, p
->second
,
916 in
->authority().first
);
918 // don't know if the inode is mine
919 dout(10) << "missing ino " << p
->first
<< ", will load later" << dendl
;
920 p
->second
.path
.clear(); // we don't need path
921 mdcache
->rejoin_recovered_caps(p
->first
, from
, p
->second
, MDS_RANK_NONE
);
925 // remove from gather set
926 client_reconnect_gather
.erase(from
);
927 if (client_reconnect_gather
.empty())
928 reconnect_gather_finish();
935 void Server::reconnect_gather_finish()
937 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
938 assert(reconnect_done
);
939 reconnect_done
->complete(0);
940 reconnect_done
= NULL
;
943 void Server::reconnect_tick()
945 utime_t reconnect_end
= reconnect_start
;
946 reconnect_end
+= g_conf
->mds_reconnect_timeout
;
947 if (ceph_clock_now() >= reconnect_end
&&
948 !client_reconnect_gather
.empty()) {
949 dout(10) << "reconnect timed out" << dendl
;
950 for (set
<client_t
>::iterator p
= client_reconnect_gather
.begin();
951 p
!= client_reconnect_gather
.end();
953 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
955 dout(1) << "reconnect gave up on " << session
->info
.inst
<< dendl
;
956 kill_session(session
, NULL
);
959 client_reconnect_gather
.clear();
960 reconnect_gather_finish();
964 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
966 if (!locks
.length()) return;
969 bufferlist::iterator p
= locks
.begin();
970 ::decode(numlocks
, p
);
971 for (int i
= 0; i
< numlocks
; ++i
) {
973 lock
.client
= client
;
974 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
975 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
977 ::decode(numlocks
, p
);
978 for (int i
= 0; i
< numlocks
; ++i
) {
980 lock
.client
= client
;
981 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
982 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
988 * Call this when the MDCache is oversized, to send requests to the clients
989 * to trim some caps, and consequently unpin some inodes in the MDCache so
990 * that it can trim too.
992 void Server::recall_client_state(float ratio
)
994 int max_caps_per_client
= (int)(g_conf
->mds_cache_size
* .8);
995 int min_caps_per_client
= 100;
997 dout(10) << "recall_client_state " << ratio
998 << ", caps per client " << min_caps_per_client
<< "-" << max_caps_per_client
1001 set
<Session
*> sessions
;
1002 mds
->sessionmap
.get_client_session_set(sessions
);
1003 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1004 p
!= sessions
.end();
1006 Session
*session
= *p
;
1007 if (!session
->is_open() ||
1008 !session
->info
.inst
.name
.is_client())
1011 dout(10) << " session " << session
->info
.inst
1012 << " caps " << session
->caps
.size()
1013 << ", leases " << session
->leases
.size()
1016 if (session
->caps
.size() > min_caps_per_client
) {
1017 int newlim
= MIN((int)(session
->caps
.size() * ratio
), max_caps_per_client
);
1018 if (session
->caps
.size() > newlim
) {
1019 MClientSession
*m
= new MClientSession(CEPH_SESSION_RECALL_STATE
);
1020 m
->head
.max_caps
= newlim
;
1021 mds
->send_message_client(m
, session
);
1022 session
->notify_recall_sent(newlim
);
1028 void Server::force_clients_readonly()
1030 dout(10) << "force_clients_readonly" << dendl
;
1031 set
<Session
*> sessions
;
1032 mds
->sessionmap
.get_client_session_set(sessions
);
1033 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1034 p
!= sessions
.end();
1036 Session
*session
= *p
;
1037 if (!session
->info
.inst
.name
.is_client() ||
1038 !(session
->is_open() || session
->is_stale()))
1040 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
1045 * some generic stuff for finishing off requests
1047 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1049 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1050 assert(!mdr
->has_completed
);
1052 // note trace items for eventual reply.
1061 early_reply(mdr
, in
, dn
);
1063 mdr
->committing
= true;
1064 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
1066 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1067 if (mds
->queue_one_replay()) {
1068 dout(10) << " queued next replay op" << dendl
;
1070 dout(10) << " journaled last replay op, flushing" << dendl
;
1073 } else if (mdr
->did_early_reply
)
1074 mds
->locker
->drop_rdlocks(mdr
.get());
1079 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1083 string
event_str("submit entry: ");
1085 mdr
->mark_event_string(event_str
);
1087 mdlog
->submit_entry(le
, fin
);
1091 * send response built from mdr contents and error code; clean up mdr
1093 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1095 if (mdr
->client_request
) {
1096 reply_client_request(mdr
, new MClientReply(mdr
->client_request
, r
));
1098 // add here to avoid counting ops multiple times (e.g., locks, loading)
1099 switch(mdr
->client_request
->get_op()) {
1100 case CEPH_MDS_OP_LOOKUPHASH
:
1101 logger
->inc(l_mdss_req_lookuphash
);
1103 case CEPH_MDS_OP_LOOKUPINO
:
1104 logger
->inc(l_mdss_req_lookupino
);
1106 case CEPH_MDS_OP_LOOKUPPARENT
:
1107 logger
->inc(l_mdss_req_lookupparent
);
1109 case CEPH_MDS_OP_LOOKUPNAME
:
1110 logger
->inc(l_mdss_req_lookupname
);
1112 case CEPH_MDS_OP_LOOKUP
:
1113 logger
->inc(l_mdss_req_lookup
);
1115 case CEPH_MDS_OP_LOOKUPSNAP
:
1116 logger
->inc(l_mdss_req_lookupsnap
);
1118 case CEPH_MDS_OP_GETATTR
:
1119 logger
->inc(l_mdss_req_getattr
);
1121 case CEPH_MDS_OP_SETATTR
:
1122 logger
->inc(l_mdss_req_setattr
);
1124 case CEPH_MDS_OP_SETLAYOUT
:
1125 logger
->inc(l_mdss_req_setlayout
);
1127 case CEPH_MDS_OP_SETDIRLAYOUT
:
1128 logger
->inc(l_mdss_req_setdirlayout
);
1130 case CEPH_MDS_OP_SETXATTR
:
1131 logger
->inc(l_mdss_req_setxattr
);
1133 case CEPH_MDS_OP_RMXATTR
:
1134 logger
->inc(l_mdss_req_rmxattr
);
1136 case CEPH_MDS_OP_READDIR
:
1137 logger
->inc(l_mdss_req_readdir
);
1139 case CEPH_MDS_OP_SETFILELOCK
:
1140 logger
->inc(l_mdss_req_setfilelock
);
1142 case CEPH_MDS_OP_GETFILELOCK
:
1143 logger
->inc(l_mdss_req_getfilelock
);
1145 case CEPH_MDS_OP_CREATE
:
1146 logger
->inc(l_mdss_req_create
);
1147 case CEPH_MDS_OP_OPEN
:
1148 logger
->inc(l_mdss_req_open
);
1150 case CEPH_MDS_OP_MKNOD
:
1151 logger
->inc(l_mdss_req_mknod
);
1153 case CEPH_MDS_OP_LINK
:
1154 logger
->inc(l_mdss_req_link
);
1156 case CEPH_MDS_OP_UNLINK
:
1157 logger
->inc(l_mdss_req_unlink
);
1159 case CEPH_MDS_OP_RMDIR
:
1160 logger
->inc(l_mdss_req_rmdir
);
1162 case CEPH_MDS_OP_RENAME
:
1163 logger
->inc(l_mdss_req_rename
);
1165 case CEPH_MDS_OP_MKDIR
:
1166 logger
->inc(l_mdss_req_mkdir
);
1168 case CEPH_MDS_OP_SYMLINK
:
1169 logger
->inc(l_mdss_req_symlink
);
1171 case CEPH_MDS_OP_LSSNAP
:
1172 logger
->inc(l_mdss_req_lssnap
);
1174 case CEPH_MDS_OP_MKSNAP
:
1175 logger
->inc(l_mdss_req_mksnap
);
1177 case CEPH_MDS_OP_RMSNAP
:
1178 logger
->inc(l_mdss_req_rmsnap
);
1180 case CEPH_MDS_OP_RENAMESNAP
:
1181 logger
->inc(l_mdss_req_renamesnap
);
1184 } else if (mdr
->internal_op
> -1) {
1185 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
1186 if (!mdr
->internal_op_finish
)
1187 assert(0 == "trying to respond to internal op without finisher");
1188 mdr
->internal_op_finish
->complete(r
);
1189 mdcache
->request_finish(mdr
);
1193 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1195 if (!g_conf
->mds_early_reply
)
1198 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
1199 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
1203 if (mdr
->alloc_ino
) {
1204 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
1208 MClientRequest
*req
= mdr
->client_request
;
1209 entity_inst_t client_inst
= req
->get_source_inst();
1210 if (client_inst
.name
.is_mds())
1213 if (req
->is_replay()) {
1214 dout(10) << " no early reply on replay op" << dendl
;
1219 MClientReply
*reply
= new MClientReply(req
, 0);
1220 reply
->set_unsafe();
1222 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1224 //_rename_finish() does not send dentry link/unlink message to replicas.
1225 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1226 // that have projected linkages from getting new replica.
1227 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
1229 dout(10) << "early_reply " << reply
->get_result()
1230 << " (" << cpp_strerror(reply
->get_result())
1231 << ") " << *req
<< dendl
;
1233 if (tracei
|| tracedn
) {
1235 mdr
->cap_releases
.erase(tracei
->vino());
1237 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1239 set_trace_dist(mdr
->session
, reply
, tracei
, tracedn
, mdr
->snapid
,
1240 req
->get_dentry_wanted(), mdr
);
1243 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1244 req
->get_connection()->send_message(reply
);
1246 mdr
->did_early_reply
= true;
1248 mds
->logger
->inc(l_mds_reply
);
1249 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
1250 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1251 dout(20) << "lat " << lat
<< dendl
;
1253 mdr
->mark_event("early_replied");
1258 * include a trace to tracei
1261 void Server::reply_client_request(MDRequestRef
& mdr
, MClientReply
*reply
)
1264 MClientRequest
*req
= mdr
->client_request
;
1266 dout(7) << "reply_client_request " << reply
->get_result()
1267 << " (" << cpp_strerror(reply
->get_result())
1268 << ") " << *req
<< dendl
;
1270 mdr
->mark_event("replying");
1272 Session
*session
= mdr
->session
;
1274 // note successful request in session map?
1276 // setfilelock requests are special, they only modify states in MDS memory.
1277 // The states get lost when MDS fails. If Client re-send a completed
1278 // setfilelock request, it means that client did not receive corresponding
1279 // setfilelock reply. So MDS should re-execute the setfilelock request.
1280 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
1281 reply
->get_result() == 0 && session
) {
1282 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
1283 session
->add_completed_request(mdr
->reqid
.tid
, created
);
1285 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
1289 // give any preallocated inos to the session
1290 apply_allocated_inos(mdr
, session
);
1292 // get tracei/tracedn from mdr?
1293 snapid_t snapid
= mdr
->snapid
;
1294 CInode
*tracei
= mdr
->tracei
;
1295 CDentry
*tracedn
= mdr
->tracedn
;
1297 bool is_replay
= mdr
->client_request
->is_replay();
1298 bool did_early_reply
= mdr
->did_early_reply
;
1299 entity_inst_t client_inst
= req
->get_source_inst();
1300 int dentry_wanted
= req
->get_dentry_wanted();
1302 if (!did_early_reply
&& !is_replay
) {
1304 mds
->logger
->inc(l_mds_reply
);
1305 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
1306 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1307 dout(20) << "lat " << lat
<< dendl
;
1310 mdr
->cap_releases
.erase(tracei
->vino());
1312 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1315 // drop non-rdlocks before replying, so that we can issue leases
1316 mdcache
->request_drop_non_rdlocks(mdr
);
1319 if (client_inst
.name
.is_mds() || !session
) {
1320 reply
->put(); // mds doesn't need a reply
1324 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
1325 (tracei
|| tracedn
)) {
1328 mdcache
->try_reconnect_cap(tracei
, session
);
1330 // include metadata in reply
1331 set_trace_dist(session
, reply
, tracei
, tracedn
,
1332 snapid
, dentry_wanted
,
1337 // We can set the extra bl unconditionally: if it's already been sent in the
1338 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1339 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1341 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
1342 req
->get_connection()->send_message(reply
);
1345 if (req
->is_queued_for_replay() &&
1346 (mdr
->has_completed
|| reply
->get_result() < 0)) {
1347 if (reply
->get_result() < 0) {
1348 int r
= reply
->get_result();
1349 derr
<< "reply_client_request: failed to replay " << *req
1350 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
1351 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
1353 mds
->queue_one_replay();
1357 mdcache
->request_finish(mdr
);
1359 // take a closer look at tracei, if it happens to be a remote link
1362 tracedn
->get_projected_linkage()->is_remote()) {
1363 mdcache
->eval_remote(tracedn
);
1368 void Server::encode_empty_dirstat(bufferlist
& bl
)
1370 static DirStat empty
;
1374 void Server::encode_infinite_lease(bufferlist
& bl
)
1381 dout(20) << "encode_infinite_lease " << e
<< dendl
;
1384 void Server::encode_null_lease(bufferlist
& bl
)
1391 dout(20) << "encode_null_lease " << e
<< dendl
;
1396 * pass inode OR dentry (not both, or we may get confused)
1398 * trace is in reverse order (i.e. root inode comes last)
1400 void Server::set_trace_dist(Session
*session
, MClientReply
*reply
,
1401 CInode
*in
, CDentry
*dn
,
1406 // skip doing this for debugging purposes?
1407 if (g_conf
->mds_inject_traceless_reply_probability
&&
1408 mdr
->ls
&& !mdr
->o_trunc
&&
1409 (rand() % 10000 < g_conf
->mds_inject_traceless_reply_probability
* 10000.0)) {
1410 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
1414 // inode, dentry, dir, ..., inode
1416 mds_rank_t whoami
= mds
->get_nodeid();
1417 client_t client
= session
->get_client();
1418 utime_t now
= ceph_clock_now();
1420 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
1422 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
1425 if (snapid
== CEPH_NOSNAP
) {
1428 realm
= in
->find_snaprealm();
1430 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
1431 reply
->snapbl
= realm
->get_snap_trace();
1432 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
1437 reply
->head
.is_dentry
= 1;
1438 CDir
*dir
= dn
->get_dir();
1439 CInode
*diri
= dir
->get_inode();
1441 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
1442 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
1444 #ifdef MDS_VERIFY_FRAGSTAT
1445 if (dir
->is_complete())
1446 dir
->verify_fragstat();
1448 dir
->encode_dirstat(bl
, whoami
);
1449 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
1451 ::encode(dn
->get_name(), bl
);
1452 if (snapid
== CEPH_NOSNAP
)
1453 mds
->locker
->issue_client_lease(dn
, client
, bl
, now
, session
);
1455 encode_null_lease(bl
);
1456 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
1458 reply
->head
.is_dentry
= 0;
1462 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
1463 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
1464 reply
->head
.is_target
= 1;
1466 reply
->head
.is_target
= 0;
1468 reply
->set_trace(bl
);
1475 * process a client request
1476 * This function DOES put the passed message before returning
1478 void Server::handle_client_request(MClientRequest
*req
)
1480 dout(4) << "handle_client_request " << *req
<< dendl
;
1483 mds
->logger
->inc(l_mds_request
);
1485 logger
->inc(l_mdss_handle_client_request
);
1487 if (!mdcache
->is_open()) {
1488 dout(5) << "waiting for root" << dendl
;
1489 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
1494 Session
*session
= 0;
1495 if (req
->get_source().is_client()) {
1496 session
= get_session(req
);
1498 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
1499 } else if (session
->is_closed() ||
1500 session
->is_closing() ||
1501 session
->is_killing()) {
1502 dout(5) << "session closed|closing|killing, dropping" << dendl
;
1506 if (req
->is_queued_for_replay())
1507 mds
->queue_one_replay();
1514 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
1515 // send it? hrm, this isn't ideal; they may get a lot of copies if
1516 // they have a high request rate.
1519 // completed request?
1520 bool has_completed
= false;
1521 if (req
->is_replay() || req
->get_retry_attempt()) {
1524 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
1525 has_completed
= true;
1526 // Don't send traceless reply if the completed request has created
1527 // new inode. Treat the request as lookup request instead.
1528 if (req
->is_replay() ||
1529 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
1530 req
->get_op() != CEPH_MDS_OP_OPEN
&&
1531 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
1532 dout(5) << "already completed " << req
->get_reqid() << dendl
;
1533 MClientReply
*reply
= new MClientReply(req
, 0);
1534 if (created
!= inodeno_t()) {
1536 ::encode(created
, extra
);
1537 reply
->set_extra_bl(extra
);
1539 req
->get_connection()->send_message(reply
);
1541 if (req
->is_queued_for_replay())
1542 mds
->queue_one_replay();
1547 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
1548 req
->get_op() != CEPH_MDS_OP_CREATE
) {
1549 dout(10) << " completed request which created new inode " << created
1550 << ", convert it to lookup request" << dendl
;
1551 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
1552 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
1557 // trim completed_request list
1558 if (req
->get_oldest_client_tid() > 0) {
1559 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
1561 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
1562 // Sessions 'completed_requests' was dirtied, mark it to be
1563 // potentially flushed at segment expiry.
1564 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
1566 if (session
->get_num_trim_requests_warnings() > 0 &&
1567 session
->get_num_completed_requests() * 2 < g_conf
->mds_max_completed_requests
)
1568 session
->reset_num_trim_requests_warnings();
1570 if (session
->get_num_completed_requests() >=
1571 (g_conf
->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
1572 session
->inc_num_trim_requests_warnings();
1574 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
1575 << req
->get_oldest_client_tid() << "), "
1576 << session
->get_num_completed_requests()
1577 << " completed requests recorded in session\n";
1578 mds
->clog
->warn() << ss
.str();
1579 dout(20) << __func__
<< " " << ss
.str() << dendl
;
1584 // register + dispatch
1585 MDRequestRef mdr
= mdcache
->request_start(req
);
1590 mdr
->session
= session
;
1591 session
->requests
.push_back(&mdr
->item_session_request
);
1595 mdr
->has_completed
= true;
1597 // process embedded cap releases?
1598 // (only if NOT replay!)
1599 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
1600 client_t client
= req
->get_source().num();
1601 for (vector
<MClientRequest::Release
>::iterator p
= req
->releases
.begin();
1602 p
!= req
->releases
.end();
1604 mds
->locker
->process_request_cap_release(mdr
, client
, p
->item
, p
->dname
);
1605 req
->releases
.clear();
1608 dispatch_client_request(mdr
);
1612 void Server::handle_osd_map()
1614 /* Note that we check the OSDMAP_FULL flag directly rather than
1615 * using osdmap_full_flag(), because we want to know "is the flag set"
1616 * rather than "does the flag apply to us?" */
1617 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
1618 is_full
= o
.test_flag(CEPH_OSDMAP_FULL
);
1619 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
1620 << o
.get_epoch() << dendl
;
1624 void Server::dispatch_client_request(MDRequestRef
& mdr
)
1626 // we shouldn't be waiting on anyone.
1627 assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
1630 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
1634 MClientRequest
*req
= mdr
->client_request
;
1636 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
1638 dout(7) << "dispatch_client_request " << *req
<< dendl
;
1640 if (req
->may_write()) {
1641 if (mdcache
->is_readonly()) {
1642 dout(10) << " read-only FS" << dendl
;
1643 respond_to_request(mdr
, -EROFS
);
1646 if (mdr
->has_more() && mdr
->more()->slave_error
) {
1647 dout(10) << " got error from slaves" << dendl
;
1648 respond_to_request(mdr
, mdr
->more()->slave_error
);
1654 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1655 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
1656 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1657 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
1658 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
1659 req
->get_op() == CEPH_MDS_OP_CREATE
||
1660 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
1661 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
1662 ((req
->get_op() == CEPH_MDS_OP_LINK
||
1663 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
1664 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
1667 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
1668 respond_to_request(mdr
, -ENOSPC
);
1671 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
1675 switch (req
->get_op()) {
1676 case CEPH_MDS_OP_LOOKUPHASH
:
1677 case CEPH_MDS_OP_LOOKUPINO
:
1678 handle_client_lookup_ino(mdr
, false, false);
1680 case CEPH_MDS_OP_LOOKUPPARENT
:
1681 handle_client_lookup_ino(mdr
, true, false);
1683 case CEPH_MDS_OP_LOOKUPNAME
:
1684 handle_client_lookup_ino(mdr
, false, true);
1688 case CEPH_MDS_OP_LOOKUP
:
1689 handle_client_getattr(mdr
, true);
1692 case CEPH_MDS_OP_LOOKUPSNAP
:
1693 // lookupsnap does not reference a CDentry; treat it as a getattr
1694 case CEPH_MDS_OP_GETATTR
:
1695 handle_client_getattr(mdr
, false);
1698 case CEPH_MDS_OP_SETATTR
:
1699 handle_client_setattr(mdr
);
1701 case CEPH_MDS_OP_SETLAYOUT
:
1702 handle_client_setlayout(mdr
);
1704 case CEPH_MDS_OP_SETDIRLAYOUT
:
1705 handle_client_setdirlayout(mdr
);
1707 case CEPH_MDS_OP_SETXATTR
:
1708 handle_client_setxattr(mdr
);
1710 case CEPH_MDS_OP_RMXATTR
:
1711 handle_client_removexattr(mdr
);
1714 case CEPH_MDS_OP_READDIR
:
1715 handle_client_readdir(mdr
);
1718 case CEPH_MDS_OP_SETFILELOCK
:
1719 handle_client_file_setlock(mdr
);
1722 case CEPH_MDS_OP_GETFILELOCK
:
1723 handle_client_file_readlock(mdr
);
1727 case CEPH_MDS_OP_CREATE
:
1728 if (mdr
->has_completed
)
1729 handle_client_open(mdr
); // already created.. just open
1731 handle_client_openc(mdr
);
1734 case CEPH_MDS_OP_OPEN
:
1735 handle_client_open(mdr
);
1740 case CEPH_MDS_OP_MKNOD
:
1741 handle_client_mknod(mdr
);
1743 case CEPH_MDS_OP_LINK
:
1744 handle_client_link(mdr
);
1746 case CEPH_MDS_OP_UNLINK
:
1747 case CEPH_MDS_OP_RMDIR
:
1748 handle_client_unlink(mdr
);
1750 case CEPH_MDS_OP_RENAME
:
1751 handle_client_rename(mdr
);
1753 case CEPH_MDS_OP_MKDIR
:
1754 handle_client_mkdir(mdr
);
1756 case CEPH_MDS_OP_SYMLINK
:
1757 handle_client_symlink(mdr
);
1762 case CEPH_MDS_OP_LSSNAP
:
1763 handle_client_lssnap(mdr
);
1765 case CEPH_MDS_OP_MKSNAP
:
1766 handle_client_mksnap(mdr
);
1768 case CEPH_MDS_OP_RMSNAP
:
1769 handle_client_rmsnap(mdr
);
1771 case CEPH_MDS_OP_RENAMESNAP
:
1772 handle_client_renamesnap(mdr
);
1776 dout(1) << " unknown client op " << req
->get_op() << dendl
;
1777 respond_to_request(mdr
, -EOPNOTSUPP
);
1782 // ---------------------------------------
1785 /* This function DOES put the passed message before returning*/
1786 void Server::handle_slave_request(MMDSSlaveRequest
*m
)
1788 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
1789 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1791 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
1795 return handle_slave_request_reply(m
);
1797 // the purpose of rename notify is enforcing causal message ordering. making sure
1798 // bystanders have received all messages from rename srcdn's auth MDS.
1799 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
1800 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(m
->get_reqid(), m
->get_attempt(),
1801 MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
1802 mds
->send_message(reply
, m
->get_connection());
1807 CDentry
*straydn
= NULL
;
1808 if (m
->stray
.length() > 0) {
1809 straydn
= mdcache
->add_replica_stray(m
->stray
, from
);
1814 // am i a new slave?
1816 if (mdcache
->have_request(m
->get_reqid())) {
1818 mdr
= mdcache
->request_get(m
->get_reqid());
1820 // is my request newer?
1821 if (mdr
->attempt
> m
->get_attempt()) {
1822 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
1823 << ", dropping " << *m
<< dendl
;
1829 if (mdr
->attempt
< m
->get_attempt()) {
1830 // mine is old, close it out
1831 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
1832 << ", closing out" << dendl
;
1833 mdcache
->request_finish(mdr
);
1835 } else if (mdr
->slave_to_mds
!= from
) {
1836 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
1841 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
&& m
->is_abort()) {
1842 mdr
->aborted
= true;
1843 if (mdr
->slave_request
) {
1844 // only abort on-going xlock, wrlock and auth pin
1845 assert(!mdr
->slave_did_prepare());
1847 mdcache
->request_finish(mdr
);
1854 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
1855 dout(10) << "missing slave request for " << m
->get_reqid()
1856 << " OP_FINISH, must have lost race with a forward" << dendl
;
1860 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
1861 mdr
->set_op_stamp(m
->op_stamp
);
1863 assert(mdr
->slave_request
== 0); // only one at a time, please!
1867 mdr
->straydn
= straydn
;
1870 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
1871 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
1872 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
1874 } else if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
1875 mdr
->locks
.empty()) {
1876 dout(3) << "not active yet, waiting" << dendl
;
1877 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
1881 mdr
->slave_request
= m
;
1883 dispatch_slave_request(mdr
);
1886 /* This function DOES put the passed message before returning*/
1887 void Server::handle_slave_request_reply(MMDSSlaveRequest
*m
)
1889 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1891 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
1892 metareqid_t r
= m
->get_reqid();
1893 if (!mdcache
->have_uncommitted_master(r
, from
)) {
1894 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
1895 << from
<< " reqid " << r
<< dendl
;
1899 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
1900 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
1904 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
1905 metareqid_t r
= m
->get_reqid();
1906 mdcache
->committed_master_slave(r
, from
);
1911 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
1912 if (m
->get_attempt() != mdr
->attempt
) {
1913 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
1914 << m
->get_attempt() << dendl
;
1919 switch (m
->get_op()) {
1920 case MMDSSlaveRequest::OP_XLOCKACK
:
1922 // identify lock, master request
1923 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
1924 m
->get_object_info());
1925 mdr
->more()->slaves
.insert(from
);
1926 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
1927 mdr
->xlocks
.insert(lock
);
1928 mdr
->locks
.insert(lock
);
1929 mdr
->finish_locking(lock
);
1930 lock
->get_xlock(mdr
, mdr
->get_client());
1932 assert(mdr
->more()->waiting_on_slave
.count(from
));
1933 mdr
->more()->waiting_on_slave
.erase(from
);
1934 assert(mdr
->more()->waiting_on_slave
.empty());
1935 mdcache
->dispatch_request(mdr
);
1939 case MMDSSlaveRequest::OP_WRLOCKACK
:
1941 // identify lock, master request
1942 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
1943 m
->get_object_info());
1944 mdr
->more()->slaves
.insert(from
);
1945 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
1946 mdr
->remote_wrlocks
[lock
] = from
;
1947 mdr
->locks
.insert(lock
);
1948 mdr
->finish_locking(lock
);
1950 assert(mdr
->more()->waiting_on_slave
.count(from
));
1951 mdr
->more()->waiting_on_slave
.erase(from
);
1952 assert(mdr
->more()->waiting_on_slave
.empty());
1953 mdcache
->dispatch_request(mdr
);
1957 case MMDSSlaveRequest::OP_AUTHPINACK
:
1958 handle_slave_auth_pin_ack(mdr
, m
);
1961 case MMDSSlaveRequest::OP_LINKPREPACK
:
1962 handle_slave_link_prep_ack(mdr
, m
);
1965 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
1966 handle_slave_rmdir_prep_ack(mdr
, m
);
1969 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
1970 handle_slave_rename_prep_ack(mdr
, m
);
1973 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
1974 handle_slave_rename_notify_ack(mdr
, m
);
1985 /* This function DOES put the mdr->slave_request before returning*/
1986 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
1988 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
1991 dout(7) << " abort flag set, finishing" << dendl
;
1992 mdcache
->request_finish(mdr
);
1996 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
1998 int op
= mdr
->slave_request
->get_op();
2000 case MMDSSlaveRequest::OP_XLOCK
:
2001 case MMDSSlaveRequest::OP_WRLOCK
:
2004 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2005 mdr
->slave_request
->get_object_info());
2008 dout(10) << "don't have object, dropping" << dendl
;
2009 ceph_abort(); // can this happen, if we auth pinned properly.
2011 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2012 dout(10) << "not auth for remote xlock attempt, dropping on "
2013 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2015 // use acquire_locks so that we get auth_pinning.
2016 set
<SimpleLock
*> rdlocks
;
2017 set
<SimpleLock
*> wrlocks
= mdr
->wrlocks
;
2018 set
<SimpleLock
*> xlocks
= mdr
->xlocks
;
2022 case MMDSSlaveRequest::OP_XLOCK
:
2023 xlocks
.insert(lock
);
2024 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2026 case MMDSSlaveRequest::OP_WRLOCK
:
2027 wrlocks
.insert(lock
);
2028 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2032 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
2036 MMDSSlaveRequest
*r
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, replycode
);
2037 r
->set_lock_type(lock
->get_type());
2038 lock
->get_parent()->set_object_info(r
->get_object_info());
2039 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2043 mdr
->slave_request
->put();
2044 mdr
->slave_request
= 0;
2048 case MMDSSlaveRequest::OP_UNXLOCK
:
2049 case MMDSSlaveRequest::OP_UNWRLOCK
:
2051 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2052 mdr
->slave_request
->get_object_info());
2054 bool need_issue
= false;
2056 case MMDSSlaveRequest::OP_UNXLOCK
:
2057 mds
->locker
->xlock_finish(lock
, mdr
.get(), &need_issue
);
2059 case MMDSSlaveRequest::OP_UNWRLOCK
:
2060 mds
->locker
->wrlock_finish(lock
, mdr
.get(), &need_issue
);
2064 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2066 // done. no ack necessary.
2067 mdr
->slave_request
->put();
2068 mdr
->slave_request
= 0;
2072 case MMDSSlaveRequest::OP_DROPLOCKS
:
2073 mds
->locker
->drop_locks(mdr
.get());
2074 mdr
->slave_request
->put();
2075 mdr
->slave_request
= 0;
2078 case MMDSSlaveRequest::OP_AUTHPIN
:
2079 handle_slave_auth_pin(mdr
);
2082 case MMDSSlaveRequest::OP_LINKPREP
:
2083 case MMDSSlaveRequest::OP_UNLINKPREP
:
2084 handle_slave_link_prep(mdr
);
2087 case MMDSSlaveRequest::OP_RMDIRPREP
:
2088 handle_slave_rmdir_prep(mdr
);
2091 case MMDSSlaveRequest::OP_RENAMEPREP
:
2092 handle_slave_rename_prep(mdr
);
2095 case MMDSSlaveRequest::OP_FINISH
:
2096 // information about rename imported caps
2097 if (mdr
->slave_request
->inode_export
.length() > 0)
2098 mdr
->more()->inode_import
.claim(mdr
->slave_request
->inode_export
);
2099 // finish off request.
2100 mdcache
->request_finish(mdr
);
2108 /* This function DOES put the mdr->slave_request before returning*/
2109 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2111 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2113 // build list of objects
2114 list
<MDSCacheObject
*> objects
;
2115 CInode
*auth_pin_freeze
= NULL
;
2116 bool fail
= false, wouldblock
= false, readonly
= false;
2118 if (mdcache
->is_readonly()) {
2119 dout(10) << " read-only FS" << dendl
;
2125 for (vector
<MDSCacheObjectInfo
>::iterator p
= mdr
->slave_request
->get_authpins().begin();
2126 p
!= mdr
->slave_request
->get_authpins().end();
2128 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2130 dout(10) << " don't have " << *p
<< dendl
;
2135 objects
.push_back(object
);
2136 if (*p
== mdr
->slave_request
->get_authpin_freeze())
2137 auth_pin_freeze
= static_cast<CInode
*>(object
);
2141 // can we auth pin them?
2143 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2146 if (!(*p
)->is_auth()) {
2147 dout(10) << " not auth for " << **p
<< dendl
;
2151 if (mdr
->is_auth_pinned(*p
))
2153 if (!mdr
->can_auth_pin(*p
)) {
2154 if (mdr
->slave_request
->is_nonblock()) {
2155 dout(10) << " can't auth_pin (freezing?) " << **p
<< " nonblocking" << dendl
;
2161 dout(10) << " waiting for authpinnable on " << **p
<< dendl
;
2162 (*p
)->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2163 mdr
->drop_local_auth_pins();
2166 if (CInode
*in
= dynamic_cast<CInode
*>(*p
)) {
2168 dir
= in
->get_parent_dir();
2169 } else if (CDentry
*dn
= dynamic_cast<CDentry
*>(*p
)) {
2170 dir
= dn
->get_dir();
2175 if (dir
->is_freezing_dir())
2176 mdcache
->fragment_freeze_inc_num_waiters(dir
);
2177 if (dir
->is_freezing_tree()) {
2178 while (!dir
->is_freezing_tree_root())
2179 dir
= dir
->get_parent_dir();
2180 mdcache
->migrator
->export_freeze_inc_num_waiters(dir
);
2190 mdr
->drop_local_auth_pins(); // just in case
2192 /* freeze authpin wrong inode */
2193 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2194 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2195 mdr
->unfreeze_auth_pin(true);
2197 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2198 * on the source inode to complete. This happens after all locks for the rename
2199 * operation are acquired. But to acquire locks, we need auth pin locks' parent
2200 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
2201 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
2202 * The solution is freeze the inode and prevent other MDRequests from getting new
2205 if (auth_pin_freeze
) {
2206 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
2207 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
2208 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
2209 mds
->mdlog
->flush();
2213 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2216 dout(10) << "auth_pinning " << **p
<< dendl
;
2222 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
2224 // return list of my auth_pins (if any)
2225 for (set
<MDSCacheObject
*>::iterator p
= mdr
->auth_pins
.begin();
2226 p
!= mdr
->auth_pins
.end();
2228 MDSCacheObjectInfo info
;
2229 (*p
)->set_object_info(info
);
2230 reply
->get_authpins().push_back(info
);
2231 if (*p
== (MDSCacheObject
*)auth_pin_freeze
)
2232 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
2236 reply
->mark_error_wouldblock();
2238 reply
->mark_error_rofs();
2240 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
2242 // clean up this request
2243 mdr
->slave_request
->put();
2244 mdr
->slave_request
= 0;
// handle_slave_auth_pin_ack: master-side handler for OP_AUTHPINACK from a
// slave. Records which objects the slave now holds auth pins on, drops
// records for pins the slave released, captures any rofs/wouldblock error,
// and re-dispatches the request once all slaves have answered.
// NOTE(review): extracted text below is missing lines; left byte-identical.
2248 /* This function DOES NOT put the passed ack before returning*/
2249 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
2251 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
2252 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// Record every pin reported by the slave; `pinned` is used below to detect
// pins the slave no longer holds.
2255 set
<MDSCacheObject
*> pinned
;
2256 for (vector
<MDSCacheObjectInfo
>::iterator p
= ack
->get_authpins().begin();
2257 p
!= ack
->get_authpins().end();
2259 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2260 assert(object
); // we pinned it
2261 dout(10) << " remote has pinned " << *object
<< dendl
;
2262 if (!mdr
->is_auth_pinned(object
))
2263 mdr
->remote_auth_pins
[object
] = from
;
2264 if (*p
== ack
->get_authpin_freeze())
2265 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
2266 pinned
.insert(object
);
2269 // removed frozen auth pin ?
2270 if (mdr
->more()->is_remote_frozen_authpin
&&
2271 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
2272 auto p
= mdr
->remote_auth_pins
.find(mdr
->more()->rename_inode
);
2273 assert(p
!= mdr
->remote_auth_pins
.end());
2274 if (p
->second
== from
) {
2275 mdr
->more()->is_remote_frozen_authpin
= false;
2279 // removed auth pins?
2280 map
<MDSCacheObject
*, mds_rank_t
>::iterator p
= mdr
->remote_auth_pins
.begin();
2281 while (p
!= mdr
->remote_auth_pins
.end()) {
2282 MDSCacheObject
* object
= p
->first
;
// Pins previously attributed to `from` but absent from this ack were dropped
// by the slave; erase with post-increment to keep the iterator valid.
2283 if (p
->second
== from
&& pinned
.count(object
) == 0) {
2284 dout(10) << " remote has unpinned " << *object
<< dendl
;
2285 mdr
->remote_auth_pins
.erase(p
++);
// Error marks from the slave abort the request on the next dispatch.
2291 if (ack
->is_error_rofs()) {
2292 mdr
->more()->slave_error
= -EROFS
;
2293 mdr
->aborted
= true;
2294 } else if (ack
->is_error_wouldblock()) {
2295 mdr
->more()->slave_error
= -EWOULDBLOCK
;
2296 mdr
->aborted
= true;
2300 mdr
->more()->slaves
.insert(from
);
2302 // clear from waiting list
2303 assert(mdr
->more()->waiting_on_slave
.count(from
));
2304 mdr
->more()->waiting_on_slave
.erase(from
);
2307 if (mdr
->more()->waiting_on_slave
.empty())
2308 mdcache
->dispatch_request(mdr
);
2310 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
2314 // ---------------------------------------
2319 * check whether we are permitted to complete a request
2321 * Check whether we have permission to perform the operation specified
2322 * by mask on the given inode, based on the capability in the mdr's
// check_access: verify the caller (uid/gid from the client request) may
// perform `mask` operations on inode `in`, using the session's MDS auth caps.
// On denial it replies to the request itself; callers just bail out.
// The setattr uid/gid args let chown/chgrp-style requests be checked against
// the new ownership. NOTE(review): the r<0 / return lines are missing from
// this extracted text; code left byte-identical.
2325 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
2328 int r
= mdr
->session
->check_access(
2330 mdr
->client_request
->get_caller_uid(),
2331 mdr
->client_request
->get_caller_gid(),
2332 &mdr
->client_request
->get_caller_gid_list(),
2333 mdr
->client_request
->head
.args
.setattr
.uid
,
2334 mdr
->client_request
->head
.args
.setattr
.gid
);
2336 respond_to_request(mdr
, r
);
2344 * check whether fragment has reached maximum size
// check_fragment_space: refuse to add entries to dirfrag `in` once it has
// reached mds_bal_fragment_size_max, replying -ENOSPC to the request itself.
// Returns false when full (caller should stop); true otherwise (return lines
// are missing from this extracted text — code left byte-identical).
2347 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
2349 const auto size
= in
->get_frag_size();
2350 if (size
>= g_conf
->mds_bal_fragment_size_max
) {
2351 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf
->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
2352 respond_to_request(mdr
, -ENOSPC
);
2360 /** validate_dentry_dir
2362 * verify that the dir exists and would own the dname.
2363 * do not check if the dentry exists.
// validate_dentry_dir: return the auth dirfrag of `diri` that would own
// `dname`, or NULL after handling the request (replied -ENOTDIR, forwarded,
// or queued on WAIT_UNFREEZE). Does not check whether the dentry exists.
2365 CDir
*Server::validate_dentry_dir(MDRequestRef
& mdr
, CInode
*diri
, const string
& dname
)
2367 // make sure parent is a dir?
2368 if (!diri
->is_dir()) {
2369 dout(7) << "validate_dentry_dir: not a dir" << dendl
;
2370 respond_to_request(mdr
, -ENOTDIR
);
// Pick the fragment that hashes `dname`, then open it (may forward/delay).
2375 frag_t fg
= diri
->pick_dirfrag(dname
);
2376 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
2381 if (dir
->is_frozen()) {
2382 dout(7) << "dir is frozen " << *dir
<< dendl
;
2383 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2391 /** prepare_null_dentry
2392 * prepare a null (or existing) dentry in given dir.
2393 * wait for any dn lock.
// prepare_null_dentry: return a null (or, if okexist, existing) dentry for
// `dname` in auth dirfrag `dir`, creating it if absent. Returns NULL after
// handling the request itself (replied -EEXIST, queued on a lock/fetch
// waiter). NOTE(review): extracted text is missing lines; left byte-identical.
2395 CDentry
* Server::prepare_null_dentry(MDRequestRef
& mdr
, CDir
*dir
, const string
& dname
, bool okexist
)
2397 dout(10) << "prepare_null_dentry " << dname
<< " in " << *dir
<< dendl
;
2398 assert(dir
->is_auth());
2400 client_t client
= mdr
->get_client();
2402 // does it already exist?
2403 CDentry
*dn
= dir
->lookup(dname
);
// Another request holds the xlock on this dentry: wait for readability.
2406 if (dn->lock.is_xlocked_by_other(mdr)) {
2407 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2408 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2412 if (!dn
->get_linkage(client
, mdr
)->is_null()) {
2413 // name already exists
2414 dout(10) << "dentry " << dname
<< " exists in " << *dir
<< dendl
;
2416 respond_to_request(mdr
, -EEXIST
);
// Start the (reused) dentry at the snaprealm's next snapid.
2420 dn
->first
= dir
->inode
->find_snaprealm()->get_newest_seq() + 1;
2426 // make sure dir is complete
2427 if (!dir
->is_complete() && (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2428 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2429 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
// Not present anywhere: create a fresh null dentry.
2434 dn
= dir
->add_null_dentry(dname
, dir
->inode
->find_snaprealm()->get_newest_seq() + 1);
2436 dout(10) << "prepare_null_dentry added " << *dn
<< dendl
;
// prepare_stray_dentry: obtain (and cache on the mdr) the stray-directory
// dentry that inode `in` would be relinked under when unlinked. Reuses a
// previously prepared straydn when its name still matches; otherwise unpins
// it and creates a new one. Returns NULL if the stray dirfrag is full
// (check_fragment_space already replied). NOTE(review): extracted text is
// missing lines; left byte-identical.
2440 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
2442 CDentry
*straydn
= mdr
->straydn
;
2445 in
->name_stray_dentry(straydname
);
2446 if (straydn
->get_name() == straydname
)
// Name changed since the last pass; must not happen after locks are taken.
2449 assert(!mdr
->done_locking
);
2450 mdr
->unpin(straydn
);
2453 CDir
*straydir
= mdcache
->get_stray_dir(in
);
// Replayed requests skip the fullness check — the operation already happened.
2455 if (!mdr
->client_request
->is_replay() &&
2456 !check_fragment_space(mdr
, straydir
))
2459 straydn
= mdcache
->get_or_create_stray_dentry(in
);
2460 mdr
->straydn
= straydn
;
2465 /** prepare_new_inode
2467 * create a new inode. set c/m/atime. hit dir pop.
// prepare_new_inode: allocate and initialize a brand-new CInode for a
// create/mknod/mkdir/openc request. Picks an ino (session prealloc pool or
// inotable), refills the session's prealloc pool when low, sets mode/uid/gid
// (honoring setgid dirs), layout, timestamps, any client-supplied xattrs,
// and registers the inode in the cache. `useino` is a client-requested ino
// (used on replay); a mismatch with the allocation is logged, not fatal.
// NOTE(review): extracted text is missing lines; code left byte-identical.
2469 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
2470 file_layout_t
*layout
)
2472 CInode
*in
= new CInode(mdcache
);
2474 // Server::prepare_force_open_sessions() can re-open session in closing
2475 // state. In that corner case, session's prealloc_inos are being freed.
2476 // To simplify the code, we disallow using/refilling session's prealloc_ino
2477 // while session is opening.
2478 bool allow_prealloc_inos
= !mdr
->session
->is_opening();
// Preferred path: take an ino from the session's preallocated pool.
2481 if (allow_prealloc_inos
&&
2482 mdr
->session
->info
.prealloc_inos
.size()) {
2483 mdr
->used_prealloc_ino
=
2484 in
->inode
.ino
= mdr
->session
->take_ino(useino
); // prealloc -> used
2485 mds
->sessionmap
.mark_projected(mdr
->session
);
2487 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
2488 << " (" << mdr
->session
->info
.prealloc_inos
2489 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
// Fallback: project a fresh allocation from the inotable.
2493 in
->inode
.ino
= mds
->inotable
->project_alloc_id();
2494 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
2497 if (useino
&& useino
!= in
->inode
.ino
) {
2498 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
2499 mds
->clog
->error() << mdr
->client_request
->get_source()
2500 << " specified ino " << useino
2501 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
2502 //ceph_abort(); // just for now.
// Refill the session's prealloc pool once it drops below half capacity.
2505 if (allow_prealloc_inos
&&
2506 mdr
->session
->get_num_projected_prealloc_inos() < g_conf
->mds_client_prealloc_inos
/ 2) {
2507 int need
= g_conf
->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
2508 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
2509 assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
2510 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
2511 mds
->sessionmap
.mark_projected(mdr
->session
);
2512 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
2515 in
->inode
.version
= 1;
2516 in
->inode
.xattr_version
= 1;
2517 in
->inode
.nlink
= 1; // FIXME
2519 in
->inode
.mode
= mode
;
// Directories get the default dir hash; files get the requested or default
// file layout.
2521 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
2522 if (in
->inode
.is_dir()) {
2523 in
->inode
.dir_layout
.dl_dir_hash
= g_conf
->mds_default_dir_hash
;
2524 } else if (layout
) {
2525 in
->inode
.layout
= *layout
;
2527 in
->inode
.layout
= mdcache
->default_file_layout
;
2530 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
2531 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
2533 CInode
*diri
= dir
->get_inode();
2535 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
// setgid directory: children inherit its gid, and subdirs inherit setgid.
2537 if (diri
->inode
.mode
& S_ISGID
) {
2538 dout(10) << " dir is sticky" << dendl
;
2539 in
->inode
.gid
= diri
->inode
.gid
;
2540 if (S_ISDIR(mode
)) {
2541 dout(10) << " new dir also sticky" << dendl
;
2542 in
->inode
.mode
|= S_ISGID
;
2545 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
2547 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
2549 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
2550 mdr
->get_op_stamp();
2552 in
->inode
.change_attr
= 0;
// The request payload may carry initial xattrs for the new inode.
2554 MClientRequest
*req
= mdr
->client_request
;
2555 if (req
->get_data().length()) {
2556 bufferlist::iterator p
= req
->get_data().begin();
2558 // xattrs on new inode?
2559 map
<string
,bufferptr
> xattrs
;
2560 ::decode(xattrs
, p
);
2561 for (map
<string
,bufferptr
>::iterator p
= xattrs
.begin(); p
!= xattrs
.end(); ++p
) {
2562 dout(10) << "prepare_new_inode setting xattr " << p
->first
<< dendl
;
2563 in
->xattrs
[p
->first
] = p
->second
;
// Disable inline data when the cluster or this client doesn't support it.
2567 if (!mds
->mdsmap
->get_inline_data_enabled() ||
2568 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
2569 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
2571 mdcache
->add_inode(in
); // add
2572 dout(10) << "prepare_new_inode " << *in
<< dendl
;
// journal_allocated_inos: record this request's ino allocations (single
// alloc, used prealloc, and new prealloc batch) in the journal EMetaBlob,
// together with the projected sessionmap and inotable versions, so replay
// can redo the allocation bookkeeping.
2576 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
2578 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
2579 << " inotablev " << mds
->inotable
->get_projected_version()
2581 blob
->set_ino_alloc(mdr
->alloc_ino
,
2582 mdr
->used_prealloc_ino
,
2584 mdr
->client_request
->get_source(),
2585 mds
->sessionmap
.get_projected(),
2586 mds
->inotable
->get_projected_version());
// apply_allocated_inos: commit the ino allocations projected by
// prepare_new_inode once the journal entry is safe — apply the inotable
// alloc(s), move the prealloc batch from pending into the session's pool,
// retire the consumed prealloc ino, and dirty the sessionmap.
2589 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
2591 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
2592 << " / " << mdr
->prealloc_inos
2593 << " / " << mdr
->used_prealloc_ino
<< dendl
;
2595 if (mdr
->alloc_ino
) {
2596 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
2598 if (mdr
->prealloc_inos
.size()) {
// Batch becomes usable: pending -> session's available prealloc set.
2600 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
2601 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
2602 mds
->sessionmap
.mark_dirty(session
);
2603 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
2605 if (mdr
->used_prealloc_ino
) {
2607 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
2608 mds
->sessionmap
.mark_dirty(session
);
// C_MDS_TryFindInode: completion for mdcache->find_ino_peers() recovery
// after an -ESTALE traverse. If peers couldn't locate the ino either
// (r == -ESTALE), fail the request; otherwise retry dispatching it.
2612 class C_MDS_TryFindInode
: public ServerContext
{
2615 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
2616 void finish(int r
) override
{
2617 if (r
== -ESTALE
) // :( find_ino_peers failed
2618 server
->respond_to_request(mdr
, r
);
2620 server
->dispatch_client_request(mdr
);
// traverse_to_auth_dir: split `refpath` into parent path + final dname,
// traverse to the parent inode (forwarding to auth as needed), and return
// the auth dirfrag that would own dname. Returns 0/NULL when the request
// was delayed, forwarded, or replied to (-EINVAL for root, traverse errors,
// -ESTALE recovery via find_ino_peers). `trace` receives the traversed
// dentries. NOTE(review): extracted text is missing lines; left byte-identical.
2624 CDir
*Server::traverse_to_auth_dir(MDRequestRef
& mdr
, vector
<CDentry
*> &trace
, filepath refpath
)
2626 // figure parent dir vs dname
2627 if (refpath
.depth() == 0) {
2628 dout(7) << "can't do that to root" << dendl
;
2629 respond_to_request(mdr
, -EINVAL
);
2632 string dname
= refpath
.last_dentry();
2633 refpath
.pop_dentry();
2635 dout(10) << "traverse_to_auth_dir dirpath " << refpath
<< " dname " << dname
<< dendl
;
2637 // traverse to parent dir
2639 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &trace
, &diri
, MDS_TRAVERSE_FORWARD
);
2640 if (r
> 0) return 0; // delayed
// -ESTALE: ask peer MDSs where the ino lives, then retry.
2643 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2644 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
2647 respond_to_request(mdr
, r
);
2651 // is it an auth dir?
2652 CDir
*dir
= validate_dentry_dir(mdr
, diri
, dname
);
2654 return 0; // forwarded or waiting for freeze
2656 dout(10) << "traverse_to_auth_dir " << *dir
<< dendl
;
2660 /* If this returns null, the request has been handled
2661 * as appropriate: forwarded on, or the client's been replied to */
// rdlock_path_pin_ref: traverse filepath `n` (0 = primary, else filepath2)
// to its target inode, forward to the auth MDS when required, wait out
// ambiguous-auth / frozen states, and collect rdlocks on the traversed
// dentries plus snap locks (with layout when `layout` is non-NULL).
// Returns NULL when the request was delayed, forwarded, or replied to.
// NOTE(review): extracted text is missing lines; code left byte-identical.
2662 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
, int n
,
2663 set
<SimpleLock
*> &rdlocks
,
2665 bool no_want_auth
, /* for readdir, who doesn't want auth _even_if_ it's
2667 file_layout_t
**layout
,
2668 bool no_lookup
) // true if we cannot return a null dentry lease
2670 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2671 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
// Re-dispatch after locks were already taken: the traverse result is cached.
2673 if (mdr
->done_locking
)
2677 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &mdr
->dn
[n
], &mdr
->in
[n
], MDS_TRAVERSE_FORWARD
);
2679 return NULL
; // delayed
2680 if (r
< 0) { // error
2681 if (r
== -ENOENT
&& n
== 0 && mdr
->dn
[n
].size()) {
// Partial trace: reply with the last dentry so the client can get a lease.
2683 mdr
->tracedn
= mdr
->dn
[n
][mdr
->dn
[n
].size()-1];
2684 respond_to_request(mdr
, r
);
2685 } else if (r
== -ESTALE
) {
2686 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2687 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
2688 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
2690 dout(10) << "FAIL on error " << r
<< dendl
;
2691 respond_to_request(mdr
, r
);
2695 CInode
*ref
= mdr
->in
[n
];
2696 dout(10) << "ref is " << *ref
<< dendl
;
2698 // fw to inode auth?
2699 if (mdr
->snapid
!= CEPH_NOSNAP
&& !no_want_auth
)
2703 if (ref
->is_ambiguous_auth()) {
2704 dout(10) << "waiting for single auth on " << *ref
<< dendl
;
2705 ref
->add_waiter(CInode::WAIT_SINGLEAUTH
, new C_MDS_RetryRequest(mdcache
, mdr
));
2708 if (!ref
->is_auth()) {
2709 dout(10) << "fw to auth for " << *ref
<< dendl
;
2710 mdcache
->request_forward(mdr
, ref
->authority().first
);
2715 // do NOT proceed if freezing, as cap release may defer in that case, and
2716 // we could deadlock when we try to lock @ref.
2717 // if we're already auth_pinned, continue; the release has already been processed.
2718 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
2719 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
2720 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
2721 ref
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2722 /* If we have any auth pins, this will deadlock.
2723 * But the only way to get here if we've already got auth pins
2724 * is because we're on an inode with snapshots that got updated
2725 * between dispatches of this request. So we're going to drop
2726 * our locks and our auth pins and reacquire them later.
2728 * This is safe since we're only in this function when working on
2729 * a single MDS request; otherwise we'd be in
2730 * rdlock_path_xlock_dentry.
2732 mds
->locker
->drop_locks(mdr
.get(), NULL
);
2733 mdr
->drop_local_auth_pins();
// Collect rdlocks on every traversed dentry plus the inode's snap locks.
2740 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2741 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2743 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, ref
, layout
);
2745 mds
->locker
->include_snap_rdlocks(rdlocks
, ref
);
2753 /** rdlock_path_xlock_dentry
2754 * traverse path to the directory that could/would contain dentry.
2755 * make sure i am auth for that dentry, forward as necessary.
2756 * create null dentry in place (or use existing if okexist).
2757 * get rdlocks on traversed dentries, xlock on new dentry.
// rdlock_path_xlock_dentry: traverse to the auth dirfrag that would contain
// the final path component, obtain the (possibly null) dentry there, and
// collect the lock set for a namespace mutation: rdlocks on traversed
// dentries + snap locks, xlock on the target dentry (when null or
// alwaysxlock), and wrlocks on the parent's filelock/nestlock for mtime and
// rstat updates. Returns NULL when the request was delayed, forwarded, or
// replied to (-EROFS for system dirs, -ENOENT under stray / mustexist miss).
// NOTE(review): extracted text is missing lines; code left byte-identical.
2759 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
, int n
,
2760 set
<SimpleLock
*>& rdlocks
, set
<SimpleLock
*>& wrlocks
, set
<SimpleLock
*>& xlocks
,
2761 bool okexist
, bool mustexist
, bool alwaysxlock
,
2762 file_layout_t
**layout
)
2764 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2766 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
2768 client_t client
= mdr
->get_client();
// Locks already taken on an earlier dispatch: reuse the cached dentry.
2770 if (mdr
->done_locking
)
2771 return mdr
->dn
[n
].back();
2773 CDir
*dir
= traverse_to_auth_dir(mdr
, mdr
->dn
[n
], refpath
);
2775 dout(10) << "rdlock_path_xlock_dentry dir " << *dir
<< dendl
;
2777 // make sure we can auth_pin (or have already authpinned) dir
2778 if (dir
->is_frozen()) {
2779 dout(7) << "waiting for !frozen/authpinnable on " << *dir
<< dendl
;
2780 dir
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
// Client-originated mutations are refused in system dirs and under stray.
2784 CInode
*diri
= dir
->get_inode();
2785 if (!mdr
->reqid
.name
.is_mds()) {
2786 if (diri
->is_system() && !diri
->is_root()) {
2787 respond_to_request(mdr
, -EROFS
);
2791 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
2792 respond_to_request(mdr
, -ENOENT
);
2796 // make a null dentry?
2797 const string
&dname
= refpath
.last_dentry();
2800 dn
= dir
->lookup(dname
);
2802 // make sure dir is complete
2803 if (!dn
&& !dir
->is_complete() &&
2804 (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2805 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2806 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
// Someone else holds the xlock on the dentry: wait for readability.
2811 if (dn
&& !dn
->lock
.can_read(client
) && dn
->lock
.get_xlock_by() != mdr
) {
2812 dout(10) << "waiting on xlocked dentry " << *dn
<< dendl
;
2813 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryRequest(mdcache
, mdr
));
2818 if (!dn
|| dn
->get_linkage(client
, mdr
)->is_null()) {
2819 dout(7) << "dentry " << dname
<< " dne in " << *dir
<< dendl
;
2820 respond_to_request(mdr
, -ENOENT
);
2824 dn
= prepare_null_dentry(mdr
, dir
, dname
, okexist
);
2829 mdr
->dn
[n
].push_back(dn
);
2830 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
2831 mdr
->in
[n
] = dnl
->get_inode();
2834 // NOTE: rename takes the same set of locks for srcdn
2835 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2836 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2837 if (alwaysxlock
|| dnl
->is_null())
2838 xlocks
.insert(&dn
->lock
); // new dn, xlock
2840 rdlocks
.insert(&dn
->lock
); // existing dn, rdlock
2841 wrlocks
.insert(&dn
->get_dir()->inode
->filelock
); // also, wrlock on dir mtime
2842 wrlocks
.insert(&dn
->get_dir()->inode
->nestlock
); // also, wrlock on dir mtime
2844 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, dn
->get_dir()->inode
, layout
);
2846 mds
->locker
->include_snap_rdlocks(rdlocks
, dn
->get_dir()->inode
);
2856 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2858 * @param diri base inode
2859 * @param fg the exact frag we want
2860 * @param mdr request
2861 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
// try_open_auth_dirfrag: open dirfrag `fg` of `diri` and ensure this MDS is
// its auth; forwards the request to the inode or dirfrag auth otherwise, or
// queues on WAIT_UNFREEZE when the inode is frozen. Returns NULL whenever
// mdr was forwarded/delayed (mdr is taken care of in those paths).
2863 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
2865 CDir
*dir
= diri
->get_dirfrag(fg
);
2867 // not open and inode not mine?
2868 if (!dir
&& !diri
->is_auth()) {
2869 mds_rank_t inauth
= diri
->authority().first
;
2870 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
2871 mdcache
->request_forward(mdr
, inauth
);
2875 // not open and inode frozen?
2876 if (!dir
&& diri
->is_frozen()) {
2877 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
2878 assert(diri
->get_parent_dir());
2879 diri
->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
// We are inode auth (or the frag already exists): open/create it locally.
2885 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
2887 // am i auth for the dirfrag?
2888 if (!dir
->is_auth()) {
2889 mds_rank_t auth
= dir
->authority().first
;
2890 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2891 << ", fw to mds." << auth
<< dendl
;
2892 mdcache
->request_forward(mdr
, auth
);
2900 // ===============================================================================
// handle_client_getattr: service a getattr (or lookup, when is_lookup) —
// resolve the path, rdlock only the metadata locks the client actually
// needs (skipping any lock whose EXCL cap the client already holds, since
// the client-side value is then authoritative), check MAY_READ access,
// note the requested cap mask for the reply, and respond with inode stats.
// NOTE(review): extracted text is missing lines; code left byte-identical.
2903 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
2905 MClientRequest
*req
= mdr
->client_request
;
2906 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
2908 if (req
->get_filepath().depth() == 0 && is_lookup
) {
2909 // refpath can't be empty for lookup but it can for
2910 // getattr (we do getattr with empty refpath for mount of '/')
2911 respond_to_request(mdr
, -EINVAL
);
2915 CInode
*ref
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, false, false, NULL
, !is_lookup
);
2919 * if client currently holds the EXCL cap on a field, do not rdlock
2920 * it; client's stat() will result in valid info if _either_ EXCL
2921 * cap is held or MDS rdlocks and reads the value here.
2923 * handling this case here is easier than weakening rdlock
2924 * semantics... that would cause problems elsewhere.
2926 client_t client
= mdr
->get_client();
2928 Capability
*cap
= ref
->get_client_cap(client
);
2929 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
2930 mdr
->snapid
<= cap
->client_follows
))
2931 issued
= cap
->issued();
// Take a rdlock per requested field only when the client lacks that EXCL cap.
2933 int mask
= req
->head
.args
.getattr
.mask
;
2934 if ((mask
& CEPH_CAP_LINK_SHARED
) && (issued
& CEPH_CAP_LINK_EXCL
) == 0) rdlocks
.insert(&ref
->linklock
);
2935 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0) rdlocks
.insert(&ref
->authlock
);
2936 if ((mask
& CEPH_CAP_FILE_SHARED
) && (issued
& CEPH_CAP_FILE_EXCL
) == 0) rdlocks
.insert(&ref
->filelock
);
2937 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0) rdlocks
.insert(&ref
->xattrlock
);
2939 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
2942 if (!check_access(mdr
, ref
, MAY_READ
))
2945 // note which caps are requested, so we return at least a snapshot
2946 // value for them. (currently this matters for xattrs and inline data)
2947 mdr
->getattr_caps
= mask
;
2949 mds
->balancer
->hit_inode(ceph_clock_now(), ref
, META_POP_IRD
,
2950 req
->get_source().num());
2953 dout(10) << "reply to stat on " << *req
<< dendl
;
2956 mdr
->tracedn
= mdr
->dn
[0].back();
2957 respond_to_request(mdr
, 0);
// C_MDS_LookupIno2: completion for the open_ino() issued by
// handle_client_lookup_ino; hands the result code to _lookup_ino_2 for
// forwarding/retry/error handling.
2960 struct C_MDS_LookupIno2
: public ServerContext
{
2962 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
2963 void finish(int r
) override
{
2964 server
->_lookup_ino_2(mdr
, r
);
2968 /* This function DOES clean up the mdr before returning*/
2972 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
2973 bool want_parent
, bool want_dentry
)
2975 MClientRequest
*req
= mdr
->client_request
;
2977 inodeno_t ino
= req
->get_filepath().get_ino();
2978 CInode
*in
= mdcache
->get_inode(ino
);
2979 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
2980 respond_to_request(mdr
, -ESTALE
);
2984 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
2988 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->is_open() &&
2989 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
2993 // check for nothing (not read or write); this still applies the
2995 if (!check_access(mdr
, in
, 0))
2998 CDentry
*dn
= in
->get_projected_parent_dn();
2999 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3001 set
<SimpleLock
*> rdlocks
;
3002 if (dn
&& (want_parent
|| want_dentry
)) {
3004 rdlocks
.insert(&dn
->lock
);
3007 unsigned mask
= req
->head
.args
.getattr
.mask
;
3009 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3011 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3012 issued
= cap
->issued();
3013 // permission bits, ACL/security xattrs
3014 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3015 rdlocks
.insert(&in
->authlock
);
3016 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3017 rdlocks
.insert(&in
->xattrlock
);
3019 mdr
->getattr_caps
= mask
;
3022 if (!rdlocks
.empty()) {
3023 set
<SimpleLock
*> wrlocks
, xlocks
;
3024 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3027 // need read access to directory inode
3028 if (!check_access(mdr
, diri
, MAY_READ
))
3033 if (in
->is_base()) {
3034 respond_to_request(mdr
, -EINVAL
);
3037 if (!diri
|| diri
->is_stray()) {
3038 respond_to_request(mdr
, -ESTALE
);
3041 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3043 respond_to_request(mdr
, 0);
3046 inodeno_t dirino
= req
->get_filepath2().get_ino();
3047 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3048 respond_to_request(mdr
, -ENOENT
);
3051 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3053 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3058 respond_to_request(mdr
, 0);
// _lookup_ino_2: continuation after open_ino() for lookup_ino. `r` is the
// rank holding the ino when >= 0 (re-dispatch locally or forward there);
// otherwise an error code (-ENOENT/-ENODATA are reported as the resulting
// error to the client).
3062 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
3064 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
3065 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3067 // `r` is a rank if >=0, else an error code
3069 mds_rank_t
dest_rank(r
);
3070 if (dest_rank
== mds
->get_nodeid())
3071 dispatch_client_request(mdr
);
3073 mdcache
->request_forward(mdr
, dest_rank
);
3078 if (r
== -ENOENT
|| r
== -ENODATA
)
3080 respond_to_request(mdr
, r
);
3084 /* This function takes responsibility for the passed mdr*/
3085 void Server::handle_client_open(MDRequestRef
& mdr
)
3087 MClientRequest
*req
= mdr
->client_request
;
3088 dout(7) << "open on " << req
->get_filepath() << dendl
;
3090 int flags
= req
->head
.args
.open
.flags
;
3091 int cmode
= ceph_flags_to_mode(flags
);
3093 respond_to_request(mdr
, -EINVAL
);
3097 bool need_auth
= !file_mode_is_readonly(cmode
) || (flags
& CEPH_O_TRUNC
);
3099 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
3100 dout(7) << "read-only FS" << dendl
;
3101 respond_to_request(mdr
, -EROFS
);
3105 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3106 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, need_auth
);
3110 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
3112 mdr
->done_locking
= false;
3113 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3118 if (!cur
->inode
.is_file()) {
3119 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3120 cmode
= CEPH_FILE_MODE_PIN
;
3121 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3122 if (cur
->inode
.is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
3123 flags
&= ~CEPH_O_TRUNC
;
3126 dout(10) << "open flags = " << flags
3127 << ", filemode = " << cmode
3128 << ", need_auth = " << need_auth
3132 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3133 dout(7) << "not a file or dir " << *cur << dendl;
3134 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3137 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->inode
.is_dir() && !cur
->inode
.is_symlink()) {
3138 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
3139 respond_to_request(mdr
, -EINVAL
);
3143 if ((flags
& CEPH_O_TRUNC
) && !cur
->inode
.is_file()) {
3144 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
3145 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3146 respond_to_request(mdr
, cur
->inode
.is_dir() ? -EISDIR
: -EINVAL
);
3150 if (cur
->inode
.inline_data
.version
!= CEPH_INLINE_NONE
&&
3151 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
3152 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
3153 respond_to_request(mdr
, -EPERM
);
3157 // snapped data is read only
3158 if (mdr
->snapid
!= CEPH_NOSNAP
&&
3159 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
3160 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
3161 respond_to_request(mdr
, -EROFS
);
3165 unsigned mask
= req
->head
.args
.open
.mask
;
3167 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
3169 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3170 issued
= cap
->issued();
3171 // permission bits, ACL/security xattrs
3172 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3173 rdlocks
.insert(&cur
->authlock
);
3174 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3175 rdlocks
.insert(&cur
->xattrlock
);
3177 mdr
->getattr_caps
= mask
;
3181 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
3182 assert(cur
->is_auth());
3184 xlocks
.insert(&cur
->filelock
);
3185 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3188 if (!check_access(mdr
, cur
, MAY_WRITE
))
3191 // wait for pending truncate?
3192 const inode_t
*pi
= cur
->get_projected_inode();
3193 if (pi
->is_truncating()) {
3194 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
3195 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
3196 mds
->locker
->drop_locks(mdr
.get());
3197 mdr
->drop_local_auth_pins();
3198 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
3202 do_open_truncate(mdr
, cmode
);
3206 // sync filelock if snapped.
3207 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3208 // and that data itself is flushed so that we can read the snapped data off disk.
3209 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
3210 rdlocks
.insert(&cur
->filelock
);
3213 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3217 if (cmode
& CEPH_FILE_MODE_WR
)
3219 if (!check_access(mdr
, cur
, mask
))
3222 if (cur
->is_file() || cur
->is_dir()) {
3223 if (mdr
->snapid
== CEPH_NOSNAP
) {
3225 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
->session
, 0, req
->is_replay());
3227 dout(12) << "open issued caps " << ccap_string(cap
->pending())
3228 << " for " << req
->get_source()
3229 << " on " << *cur
<< dendl
;
3231 int caps
= ceph_caps_for_mode(cmode
);
3232 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
3233 << " for " << req
->get_source()
3234 << " snapid " << mdr
->snapid
3235 << " on " << *cur
<< dendl
;
3236 mdr
->snap_caps
= caps
;
3240 // increase max_size?
3241 if (cmode
& CEPH_FILE_MODE_WR
)
3242 mds
->locker
->check_inode_max_size(cur
);
3244 // make sure this inode gets into the journal
3245 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
3246 !cur
->item_open_file
.is_on_list()) {
3247 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3248 EOpen
*le
= new EOpen(mds
->mdlog
);
3249 mdlog
->start_entry(le
);
3250 le
->add_clean_inode(cur
);
3251 ls
->open_files
.push_back(&cur
->item_open_file
);
3252 mdlog
->submit_entry(le
);
3256 if (cmode
& CEPH_FILE_MODE_WR
)
3257 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), cur
, META_POP_IWR
);
3259 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), cur
, META_POP_IRD
,
3260 mdr
->client_request
->get_source().num());
3263 if (req
->get_dentry_wanted()) {
3264 assert(mdr
->dn
[0].size());
3265 dn
= mdr
->dn
[0].back();
3270 respond_to_request(mdr
, 0);
// NOTE(review): this region is a lossily re-wrapped extraction -- logical
// source lines are split across physical lines and the embedded original
// line numbers jump (some lines, e.g. closing braces, were dropped).
// Code left byte-identical; comments only.
//
// Journal-commit finisher for O_CREAT opens: once the EUpdate for the new
// inode is durable, it pops the projected dentry linkage, dirties the new
// inode (and its parent backpointer), notifies peers of the new dentry,
// bumps write popularity, and replies to the client.
3273 class C_MDS_openc_finish
: public ServerLogContext
{
// Ctor captures the dentry, the newly created inode, and the snapid the
// create follows; forwards server/mdr to ServerLogContext.
3278 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
, snapid_t f
) :
3279 ServerLogContext(s
, r
), dn(d
), newi(ni
), follows(f
) {}
3280 void finish(int r
) override
{
// Make the projected (speculative) linkage the real one.
3283 dn
->pop_projected_linkage();
3285 // dirty inode, dn, dir
3286 newi
->inode
.version
--; // a bit hacky, see C_MDS_mknod_finish
3287 newi
->mark_dirty(newi
->inode
.version
+1, mdr
->ls
);
3288 newi
->_mark_dirty_parent(mdr
->ls
, true);
// Tell clients about any max_size change on the new inode.
3292 get_mds()->locker
->share_inode_max_size(newi
);
3294 MDRequestRef null_ref
;
3295 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
3297 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), newi
, META_POP_IWR
);
// Reply to the client, then honor the debug kill point (test hook).
3299 server
->respond_to_request(mdr
, 0);
3301 assert(g_conf
->mds_kill_openc_at
!= 1);
// NOTE(review): lossily re-wrapped extraction -- logical lines split across
// physical lines; embedded original numbering jumps indicate dropped lines
// (returns/braces). Code left byte-identical; comments only.
//
// Handle a client open(O_CREAT): either forward to plain open if the file
// already exists (non-EXCL), or create a new inode with the requested file
// layout, journal it via an EUpdate, and reply.
3305 /* This function takes responsibility for the passed mdr*/
3306 void Server::handle_client_openc(MDRequestRef
& mdr
)
3308 MClientRequest
*req
= mdr
->client_request
;
3309 client_t client
= mdr
->get_client();
3311 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
// Translate client open flags into a ceph file mode; EINVAL path below
// presumably fires when the mode is invalid (guard line lost in extraction).
3313 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
3315 respond_to_request(mdr
, -EINVAL
);
// Without O_EXCL, first try to traverse to an existing file and hand off to
// the regular open path if found.
3319 if (!(req
->head
.args
.open
.flags
& CEPH_O_EXCL
)) {
3320 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(),
3321 &mdr
->dn
[0], NULL
, MDS_TRAVERSE_FORWARD
);
3325 handle_client_open(mdr
);
// Traverse errors other than ENOENT: attempt ESTALE recovery by asking
// peers for the ino, otherwise reply with the error.
3328 if (r
< 0 && r
!= -ENOENT
) {
3330 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3331 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
3332 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), c
);
3334 dout(10) << "FAIL on error " << r
<< dendl
;
3335 respond_to_request(mdr
, r
);
// Lock the path and xlock the target dentry; also pick up the parent
// directory's default file layout if one is set.
3342 bool excl
= (req
->head
.args
.open
.flags
& CEPH_O_EXCL
);
3343 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3344 file_layout_t
*dir_layout
= NULL
;
3345 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
,
3346 !excl
, false, false, &dir_layout
);
// Creation in a snapshot is not allowed.
3348 if (mdr
->snapid
!= CEPH_NOSNAP
) {
3349 respond_to_request(mdr
, -EROFS
);
3353 file_layout_t layout
;
3355 layout
= *dir_layout
;
3357 layout
= mdcache
->default_file_layout
;
3359 // What kind of client caps are required to complete this operation
3360 uint64_t access
= MAY_WRITE
;
3362 const auto default_layout
= layout
;
3364 // fill in any special params from client
3365 if (req
->head
.args
.open
.stripe_unit
)
3366 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
3367 if (req
->head
.args
.open
.stripe_count
)
3368 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
3369 if (req
->head
.args
.open
.object_size
)
3370 layout
.object_size
= req
->head
.args
.open
.object_size
;
3371 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
3372 (__s32
)req
->head
.args
.open
.pool
>= 0) {
3373 layout
.pool_id
= req
->head
.args
.open
.pool
;
3375 // make sure we have as new a map as the client
3376 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
3377 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
3382 // If client doesn't have capability to modify layout pools, then
3383 // only permit this request if the requested pool matches what the
3384 // file would have inherited anyway from its parent.
3385 if (default_layout
!= layout
) {
3386 access
|= MAY_SET_VXATTR
;
// Validate the resulting layout and its data pool before creating anything.
3389 if (!layout
.is_valid()) {
3390 dout(10) << " invalid initial file layout" << dendl
;
3391 respond_to_request(mdr
, -EINVAL
);
3394 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
3395 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
3396 respond_to_request(mdr
, -EINVAL
);
// Take the parent dir's authlock, then acquire all locks; check access and
// that the fragment has room for another entry.
3400 CDir
*dir
= dn
->get_dir();
3401 CInode
*diri
= dir
->get_inode();
3402 rdlocks
.insert(&diri
->authlock
);
3403 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3406 if (!check_access(mdr
, diri
, access
))
3409 if (!check_fragment_space(mdr
, dir
))
3412 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
// Dentry already populated: only reachable with O_EXCL here, so fail EEXIST.
3414 if (!dnl
->is_null()) {
3416 assert(req
->head
.args
.open
.flags
& CEPH_O_EXCL
);
3417 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl
;
3418 mdr
->tracei
= dnl
->get_inode();
3420 respond_to_request(mdr
, -EEXIST
);
3427 SnapRealm
*realm
= diri
->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3428 snapid_t follows
= realm
->get_newest_seq();
// Allocate and initialize the new regular-file inode with the chosen layout.
3430 CInode
*in
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
3431 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
3435 dn
->push_projected_linkage(in
);
3437 in
->inode
.version
= dn
->pre_dirty();
3438 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
3439 in
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
3440 in
->inode
.update_backtrace();
// For writable opens, seed the client's writeable range for this inode.
3441 if (cmode
& CEPH_FILE_MODE_WR
) {
3442 in
->inode
.client_ranges
[client
].range
.first
= 0;
3443 in
->inode
.client_ranges
[client
].range
.last
= in
->inode
.get_layout_size_increment();
3444 in
->inode
.client_ranges
[client
].follows
= follows
;
3446 in
->inode
.rstat
.rfiles
= 1;
3448 assert(dn
->first
== follows
+1);
3449 in
->first
= dn
->first
;
// Journal the create as an EUpdate ("openc").
3452 mdr
->ls
= mdlog
->get_current_segment();
3453 EUpdate
*le
= new EUpdate(mdlog
, "openc");
3454 mdlog
->start_entry(le
);
3455 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
3456 journal_allocated_inos(mdr
, &le
->metablob
);
3457 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
3458 le
->metablob
.add_primary_dentry(dn
, in
, true, true, true);
// Issue caps for the creator and start auth/xattr locks in EXCL.
3461 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, req
->is_replay());
3462 in
->authlock
.set_state(LOCK_EXCL
);
3463 in
->xattrlock
.set_state(LOCK_EXCL
);
3465 // make sure this inode gets into the journal
3466 le
->metablob
.add_opened_ino(in
->ino());
3467 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3468 ls
->open_files
.push_back(&in
->item_open_file
);
3470 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, in
, follows
);
3472 if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
3473 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
3474 // add the file created flag onto the reply if create_flags features is supported
3475 ::encode(in
->inode
.ino
, mdr
->reply_extra_bl
);
3478 journal_and_reply(mdr
, in
, dn
, le
, fin
);
3480 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3481 // have overshot the split size (multiple opencs in flight), so here is
3482 // an early chance to split the dir if this openc makes it oversized.
3483 mds
->balancer
->maybe_fragment(dir
, false);
// NOTE(review): lossily re-wrapped extraction -- logical lines split across
// physical lines; embedded numbering jumps indicate dropped lines. Code left
// byte-identical; comments only.
//
// Handle a client readdir: lock the directory, pick/open the right dirfrag,
// then encode up to max entries (bounded by max_bytes) with leases and
// inodestats into the reply.
3488 void Server::handle_client_readdir(MDRequestRef
& mdr
)
3490 MClientRequest
*req
= mdr
->client_request
;
3491 client_t client
= req
->get_source().num();
3492 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3493 CInode
*diri
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, false, true);
3496 // it's a directory, right?
3497 if (!diri
->is_dir()) {
3499 dout(10) << "reply to " << *req
<< " readdir -ENOTDIR" << dendl
;
3500 respond_to_request(mdr
, -ENOTDIR
);
// Need the filelock and dirfragtreelock rdlocked to read contents safely.
3504 rdlocks
.insert(&diri
->filelock
);
3505 rdlocks
.insert(&diri
->dirfragtreelock
);
3507 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3510 if (!check_access(mdr
, diri
, MAY_READ
))
// Decode readdir arguments: requested frag, flags, and the string/hash
// offset the client wants to resume from.
3514 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
3515 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
3516 string offset_str
= req
->get_path2();
3518 __u32 offset_hash
= 0;
3519 if (!offset_str
.empty())
3520 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
3522 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
3524 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
3525 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
3527 // does the frag exist?
// If the requested frag no longer matches the fragtree (e.g. after a
// split/merge), adjust to the frag that currently covers the offset.
3528 if (diri
->dirfragtree
[fg
.value()] != fg
) {
3530 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3531 if (fg
.contains((unsigned)offset_hash
)) {
3532 newfg
= diri
->dirfragtree
[offset_hash
];
3534 // client actually wants next frag
3535 newfg
= diri
->dirfragtree
[fg
.value()];
3539 newfg
= diri
->dirfragtree
[fg
.value()];
3541 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
3545 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
3549 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
3550 assert(dir
->is_auth());
// Incomplete dirfrag: wait for unfreeze or fetch its contents, retrying the
// request afterwards (locks and auth pins are dropped first).
3552 if (!dir
->is_complete()) {
3553 if (dir
->is_frozen()) {
3554 dout(7) << "dir is frozen " << *dir
<< dendl
;
3555 mds
->locker
->drop_locks(mdr
.get());
3556 mdr
->drop_local_auth_pins();
3557 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3561 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
3562 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
3566 #ifdef MDS_VERIFY_FRAGSTAT
3567 dir
->verify_fragstat();
3570 utime_t now
= ceph_clock_now();
3571 mdr
->set_mds_stamp(now
);
3573 snapid_t snapid
= mdr
->snapid
;
3574 dout(10) << "snapid " << snapid
<< dendl
;
3576 SnapRealm
*realm
= diri
->find_snaprealm();
// Size limits: entry count (max) and encoded bytes (max_bytes), with a
// fallback budget that leaves room for at least one max-sized entry.
3578 unsigned max
= req
->head
.args
.readdir
.max_entries
;
3580 max
= dir
->get_num_any(); // whatever, something big.
3581 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
3583 // make sure at least one item can be encoded
3584 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
3588 dir
->encode_dirstat(dirbl
, mds
->get_nodeid());
3590 // count bytes available.
3591 // this isn't perfect, but we should capture the main variable/unbounded size items!
3592 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
3593 int bytes_left
= max_bytes
- front_bytes
;
3594 bytes_left
-= realm
->get_snap_trace().length();
3596 // build dir contents
3599 bool start
= !offset_hash
&& offset_str
.empty();
3600 bool end
= (dir
->begin() == dir
->end());
3601 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3602 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
// Main encode loop: walk dentries from the resume point until we run out of
// entries, hit the count limit, or exhaust the byte budget.
3603 for (CDir::map_t::iterator it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
3604 !end
&& numfiles
< max
;
3605 end
= (it
== dir
->end())) {
3606 CDentry
*dn
= it
->second
;
3609 if (dn
->state_test(CDentry::STATE_PURGING
))
3612 bool dnp
= dn
->use_projected(client
, mdr
);
3613 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
// Skip dentries whose [first,last] snap range doesn't cover this snapid.
3618 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
3619 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
3624 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
3625 if (!(offset_key
< dn
->key()))
3629 CInode
*in
= dnl
->get_inode();
3631 if (in
&& in
->ino() == CEPH_INO_CEPH
)
3635 // better for the MDS to do the work, if we think the client will stat any of these files.
// Remote dentry with no inode in cache: try the cache first; otherwise open
// the remote dentry (retrying or replying early depending on progress).
3636 if (dnl
->is_remote() && !in
) {
3637 in
= mdcache
->get_inode(dnl
->get_remote_ino());
3639 dn
->link_remote(dnl
, in
);
3640 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
3641 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
3644 // touch everything i _do_ have
3645 for (CDir::map_t::iterator p
= dir
->begin(); p
!= dir
->end(); ++p
)
3646 if (!p
->second
->get_linkage()->is_null())
3647 mdcache
->lru
.lru_touch(p
->second
);
3649 // already issued caps and leases, reply immediately.
3650 if (dnbl
.length() > 0) {
3651 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
3652 dout(10) << " open remote dentry after caps were issued, stopping at "
3653 << dnbl
.length() << " < " << bytes_left
<< dendl
;
3657 mds
->locker
->drop_locks(mdr
.get());
3658 mdr
->drop_local_auth_pins();
3659 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
// Stop before overflowing the byte budget with this entry's name + lease.
3665 if ((int)(dnbl
.length() + dn
->name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
3666 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
3670 unsigned start_len
= dnbl
.length();
3673 dout(12) << "including dn " << *dn
<< dendl
;
3674 ::encode(dn
->name
, dnbl
);
3675 mds
->locker
->issue_client_lease(dn
, client
, dnbl
, now
, mdr
->session
);
3678 dout(12) << "including inode " << *in
<< dendl
;
3679 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
3681 // chop off dn->name, lease
// Inode didn't fit: roll the buffer back to before this entry and stop.
3682 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
3684 keep
.substr_of(dnbl
, 0, start_len
);
3692 mdcache
->lru
.lru_touch(dn
);
// Reply flags: END when we finished the frag; COMPLETE/hash-order bits only
// for clients that understand the bitflags protocol.
3697 flags
= CEPH_READDIR_FRAG_END
;
3699 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
3701 // client only understand END and COMPLETE flags ?
3702 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3703 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
3706 // finish final blob
3707 ::encode(numfiles
, dirbl
);
3708 ::encode(flags
, dirbl
);
3709 dirbl
.claim_append(dnbl
);
3712 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
3713 << " bytes=" << dirbl
.length()
3714 << " start=" << (int)start
3715 << " end=" << (int)end
3717 mdr
->reply_extra_bl
= dirbl
;
3719 // bump popularity. NOTE: this doesn't quite capture it.
3720 mds
->balancer
->hit_dir(now
, dir
, META_POP_IRD
, -1, numfiles
);
3724 respond_to_request(mdr
, 0);
// NOTE(review): lossily re-wrapped extraction; code left byte-identical,
// comments only.
3729 // ===============================================================================
3734 * finisher for basic inode updates
// Journal-commit finisher shared by setattr/setlayout/open-truncate paths:
// makes the projected inode live, kicks off truncation work if we shrank the
// file, bumps write popularity, replies, and (apparently when ranges changed;
// guard line lost in extraction) shares the new max_size with clients.
3736 class C_MDS_inode_update_finish
: public ServerLogContext
{
3738 bool truncating_smaller
, changed_ranges
;
3740 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
3741 bool sm
=false, bool cr
=false) :
3742 ServerLogContext(s
, r
), in(i
), truncating_smaller(sm
), changed_ranges(cr
) { }
3743 void finish(int r
) override
{
// Commit the projected inode now that the log event is durable.
3747 in
->pop_and_dirty_projected_inode(mdr
->ls
);
3750 // notify any clients
3751 if (truncating_smaller
&& in
->inode
.is_truncating()) {
3752 get_mds()->locker
->issue_truncate(in
);
3753 get_mds()->mdcache
->truncate_inode(in
, mdr
->ls
);
3756 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), in
, META_POP_IWR
);
3758 server
->respond_to_request(mdr
, 0);
3761 get_mds()->locker
->share_inode_max_size(in
);
// NOTE(review): lossily re-wrapped extraction; code left byte-identical,
// comments only.
//
// Handle a client advisory-lock change (flock/fcntl set/unset): xlock the
// inode's flocklock, copy the request into a ceph_filelock, then apply the
// unlock or lock attempt against the per-inode lock state, blocking the
// request (waiter + retry) when the client asked to wait.
3765 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
3767 MClientRequest
*req
= mdr
->client_request
;
3768 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3770 // get the inode to operate on, and set up any locks needed for that
3771 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3775 xlocks
.insert(&cur
->flocklock
);
3776 /* acquire_locks will return true if it gets the locks. If it fails,
3777 it will redeliver this request at a later date, so drop the request.
3779 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
3780 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
3784 // copy the lock change into a ceph_filelock so we can store/apply it
3785 ceph_filelock set_lock
;
3786 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
3787 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
3788 set_lock
.client
= req
->get_orig_source().num();
3789 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
3790 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
3791 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
3792 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
3794 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
3796 ceph_lock_state_t
*lock_state
= NULL
;
3797 bool interrupt
= false;
3799 // get the appropriate lock state
// Dispatch on lock family; the *_INTR variants presumably set `interrupt`
// before falling through (those lines were lost in extraction).
3800 switch (req
->head
.args
.filelock_change
.rule
) {
3801 case CEPH_LOCK_FLOCK_INTR
:
3804 case CEPH_LOCK_FLOCK
:
3805 lock_state
= cur
->get_flock_lock_state();
3808 case CEPH_LOCK_FCNTL_INTR
:
3811 case CEPH_LOCK_FCNTL
:
3812 lock_state
= cur
->get_fcntl_lock_state();
3816 dout(10) << "got unknown lock type " << set_lock
.type
3817 << ", dropping request!" << dendl
;
3818 respond_to_request(mdr
, -EOPNOTSUPP
);
3822 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
// Unlock path: cancel a pending waiter, or remove an established lock, then
// wake anyone waiting on the flock state.
3823 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
3824 list
<ceph_filelock
> activated_locks
;
3825 list
<MDSInternalContextBase
*> waiters
;
3826 if (lock_state
->is_waiting(set_lock
)) {
3827 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
3828 lock_state
->remove_waiting(set_lock
);
3829 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
3830 } else if (!interrupt
) {
3831 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
3832 lock_state
->remove_lock(set_lock
, activated_locks
);
3833 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
3835 mds
->queue_waiters(waiters
);
3837 respond_to_request(mdr
, 0);
// Lock path: handle a canceled wait, a failed attempt (deadlock or
// would-block), or park the request on the inode's flock waitlist.
3839 dout(10) << " lock attempt on " << set_lock
<< dendl
;
3840 bool deadlock
= false;
3841 if (mdr
->more()->flock_was_waiting
&&
3842 !lock_state
->is_waiting(set_lock
)) {
3843 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
3844 respond_to_request(mdr
, -EINTR
);
3845 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
3846 dout(10) << " it failed on this attempt" << dendl
;
3847 // couldn't set lock right now
3849 respond_to_request(mdr
, -EDEADLK
);
3850 } else if (!will_wait
) {
3851 respond_to_request(mdr
, -EWOULDBLOCK
);
3853 dout(10) << " added to waiting list" << dendl
;
3854 assert(lock_state
->is_waiting(set_lock
));
3855 mdr
->more()->flock_was_waiting
= true;
3856 mds
->locker
->drop_locks(mdr
.get());
3857 mdr
->drop_local_auth_pins();
3858 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
3861 respond_to_request(mdr
, 0);
3863 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
// NOTE(review): lossily re-wrapped extraction; code left byte-identical,
// comments only.
//
// Handle a client getlk-style query: rdlock the inode's flocklock, build a
// ceph_filelock from the request, ask the lock state what (if anything)
// conflicts, and return the (possibly updated) lock in the reply blob.
3866 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
3868 MClientRequest
*req
= mdr
->client_request
;
3869 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3871 // get the inode to operate on, and set up any locks needed for that
3872 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3876 /* acquire_locks will return true if it gets the locks. If it fails,
3877 it will redeliver this request at a later date, so drop the request.
3879 rdlocks
.insert(&cur
->flocklock
);
3880 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
3881 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
3885 // copy the lock change into a ceph_filelock so we can store/apply it
3886 ceph_filelock checking_lock
;
3887 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
3888 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
3889 checking_lock
.client
= req
->get_orig_source().num();
3890 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
3891 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
3892 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
3894 // get the appropriate lock state
3895 ceph_lock_state_t
*lock_state
= NULL
;
3896 switch (req
->head
.args
.filelock_change
.rule
) {
3897 case CEPH_LOCK_FLOCK
:
3898 lock_state
= cur
->get_flock_lock_state();
3901 case CEPH_LOCK_FCNTL
:
3902 lock_state
= cur
->get_fcntl_lock_state();
3906 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
3907 respond_to_request(mdr
, -EINVAL
);
// Query the state and ship the result back in the reply's extra bufferlist.
3910 lock_state
->look_for_lock(checking_lock
);
3913 ::encode(checking_lock
, lock_bl
);
3915 mdr
->reply_extra_bl
= lock_bl
;
3916 respond_to_request(mdr
, 0);
// NOTE(review): lossily re-wrapped extraction -- logical lines split across
// physical lines; some original lines dropped. Code left byte-identical;
// comments only.
//
// Handle a client setattr: take the locks implied by the attribute mask,
// enforce permissions (chown/chgrp need extra rights), project the inode
// changes (including size truncation), and journal them as an EUpdate.
3919 void Server::handle_client_setattr(MDRequestRef
& mdr
)
3921 MClientRequest
*req
= mdr
->client_request
;
3922 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3923 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
// No attribute changes in snapshots, nor on reserved system inodes.
3926 if (mdr
->snapid
!= CEPH_NOSNAP
) {
3927 respond_to_request(mdr
, -EROFS
);
3930 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
3931 respond_to_request(mdr
, -EPERM
);
3935 __u32 mask
= req
->head
.args
.setattr
.mask
;
3936 __u32 access_mask
= MAY_WRITE
;
// Choose locks by which attributes are being set: authlock for ownership/
// mode-ish attrs, filelock for times/size, versionlock for ctime.
3939 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
3940 xlocks
.insert(&cur
->authlock
);
3941 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
3942 xlocks
.insert(&cur
->filelock
);
3943 if (mask
& CEPH_SETATTR_CTIME
)
3944 wrlocks
.insert(&cur
->versionlock
);
3946 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
// Changing uid/gid to a different value requires chown/chgrp rights.
3949 if ((mask
& CEPH_SETATTR_UID
) && (cur
->inode
.uid
!= req
->head
.args
.setattr
.uid
))
3950 access_mask
|= MAY_CHOWN
;
3952 if ((mask
& CEPH_SETATTR_GID
) && (cur
->inode
.gid
!= req
->head
.args
.setattr
.gid
))
3953 access_mask
|= MAY_CHGRP
;
3955 if (!check_access(mdr
, cur
, access_mask
))
3958 // trunc from bigger -> smaller?
3959 inode_t
*pi
= cur
->get_projected_inode();
3961 uint64_t old_size
= MAX(pi
->size
, req
->head
.args
.setattr
.old_size
);
3963 // ENOSPC on growing file while full, but allow shrinks
3964 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
3965 dout(20) << __func__
<< ": full, responding ENOSPC to setattr with larger size" << dendl
;
3966 respond_to_request(mdr
, -ENOSPC
);
// Only one truncation can be in flight per inode: if one is pending, drop
// locks/pins and retry once it completes.
3970 bool truncating_smaller
= false;
3971 if (mask
& CEPH_SETATTR_SIZE
) {
3972 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
3973 if (truncating_smaller
&& pi
->is_truncating()) {
3974 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
3975 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
3976 mds
->locker
->drop_locks(mdr
.get());
3977 mdr
->drop_local_auth_pins();
3978 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
3983 bool changed_ranges
= false;
// Start the journal entry, then project the requested attribute changes.
3986 mdr
->ls
= mdlog
->get_current_segment();
3987 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
3988 mdlog
->start_entry(le
);
3990 pi
= cur
->project_inode();
3992 if (mask
& CEPH_SETATTR_UID
)
3993 pi
->uid
= req
->head
.args
.setattr
.uid
;
3994 if (mask
& CEPH_SETATTR_GID
)
3995 pi
->gid
= req
->head
.args
.setattr
.gid
;
3997 if (mask
& CEPH_SETATTR_MODE
)
3998 pi
->mode
= (pi
->mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
// Without an explicit mode change, chown/chgrp/KILL_SGUID on a regular file
// clears setuid, and setgid too when group-exec is set (POSIX semantics).
3999 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
4000 S_ISREG(pi
->mode
)) {
4001 pi
->mode
&= ~S_ISUID
;
4002 if ((pi
->mode
& (S_ISGID
|S_IXGRP
)) == (S_ISGID
|S_IXGRP
))
4003 pi
->mode
&= ~S_ISGID
;
4006 if (mask
& CEPH_SETATTR_MTIME
)
4007 pi
->mtime
= req
->head
.args
.setattr
.mtime
;
4008 if (mask
& CEPH_SETATTR_ATIME
)
4009 pi
->atime
= req
->head
.args
.setattr
.atime
;
4010 if (mask
& CEPH_SETATTR_BTIME
)
4011 pi
->btime
= req
->head
.args
.setattr
.btime
;
4012 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
4013 pi
->time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
// Size change: shrink via the truncate machinery, otherwise just set size
// (rstat rbytes tracks the file size), and recompute client write ranges.
4014 if (mask
& CEPH_SETATTR_SIZE
) {
4015 if (truncating_smaller
) {
4016 pi
->truncate(old_size
, req
->head
.args
.setattr
.size
);
4017 le
->metablob
.add_truncate_start(cur
->ino());
4019 pi
->size
= req
->head
.args
.setattr
.size
;
4020 pi
->rstat
.rbytes
= pi
->size
;
4022 pi
->mtime
= mdr
->get_op_stamp();
4024 // adjust client's max_size?
4025 map
<client_t
,client_writeable_range_t
> new_ranges
;
4026 bool max_increased
= false;
4027 mds
->locker
->calc_new_client_ranges(cur
, pi
->size
, &new_ranges
, &max_increased
);
4028 if (pi
->client_ranges
!= new_ranges
) {
4029 dout(10) << " client_ranges " << pi
->client_ranges
<< " -> " << new_ranges
<< dendl
;
4030 pi
->client_ranges
= new_ranges
;
4031 changed_ranges
= true;
4035 pi
->version
= cur
->pre_dirty();
4036 pi
->ctime
= mdr
->get_op_stamp();
// Journal the dirty inode and reply once the event commits.
4040 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4041 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4042 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4044 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
4045 truncating_smaller
, changed_ranges
));
4047 // flush immediately if there are readers/writers waiting
4048 if (xlocks
.count(&cur
->filelock
) &&
4049 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
4050 mds
->mdlog
->flush();
// NOTE(review): lossily re-wrapped extraction; code left byte-identical,
// comments only.
//
// Complete an open(O_TRUNC): issue caps for the opener, project the inode
// with a truncate-to-zero, journal it as an EUpdate ("open_truncate"), and
// reply when durable. Called from the open path after locks are held.
4053 /* Takes responsibility for mdr */
4054 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
4056 CInode
*in
= mdr
->in
[0];
4057 client_t client
= mdr
->get_client();
4060 dout(10) << "do_open_truncate " << *in
<< dendl
;
4062 SnapRealm
*realm
= in
->find_snaprealm();
4063 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, mdr
->client_request
->is_replay());
4065 mdr
->ls
= mdlog
->get_current_segment();
4066 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
4067 mdlog
->start_entry(le
);
// Project the inode: bump version, stamp times, and truncate to 0 from the
// larger of the current size and the client-reported old size.
4070 inode_t
*pi
= in
->project_inode();
4071 pi
->version
= in
->pre_dirty();
4072 pi
->mtime
= pi
->ctime
= mdr
->get_op_stamp();
4075 uint64_t old_size
= MAX(pi
->size
, mdr
->client_request
->head
.args
.open
.old_size
);
4077 pi
->truncate(old_size
, 0);
4078 le
->metablob
.add_truncate_start(in
->ino());
// For writable opens, seed this client's writeable range on the inode.
4081 bool changed_ranges
= false;
4082 if (cmode
& CEPH_FILE_MODE_WR
) {
4083 pi
->client_ranges
[client
].range
.first
= 0;
4084 pi
->client_ranges
[client
].range
.last
= pi
->get_layout_size_increment();
4085 pi
->client_ranges
[client
].follows
= in
->find_snaprealm()->get_newest_seq();
4086 changed_ranges
= true;
// Journal the dirty inode and make sure the ino is tracked as open.
4089 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
4091 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
4092 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
4094 // make sure ino gets into the journal
4095 le
->metablob
.add_opened_ino(in
->ino());
4096 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
4097 ls
->open_files
.push_back(&in
->item_open_file
);
4099 mdr
->o_trunc
= true;
// Include the traced dentry in the reply when the client asked for it.
4102 if (mdr
->client_request
->get_dentry_wanted()) {
4103 assert(mdr
->dn
[0].size());
4104 dn
= mdr
->dn
[0].back();
4107 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
4109 // Although the `open` part can give an early reply, the truncation won't
4110 // happen until our EUpdate is persistent, to give the client a prompt
4111 // response we must also flush that event.
// NOTE(review): lossily re-wrapped extraction; code left byte-identical,
// comments only.
//
// Handle a client set-file-layout: only allowed on empty, never-truncated
// regular files and never in snapshots; merges requested layout fields over
// the current layout, validates, and journals the new layout as an EUpdate.
4116 /* This function cleans up the passed mdr */
4117 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
4119 MClientRequest
*req
= mdr
->client_request
;
4120 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4121 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4124 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4125 respond_to_request(mdr
, -EROFS
);
4128 if (!cur
->is_file()) {
4129 respond_to_request(mdr
, -EINVAL
);
// Layout can only change while the file has no data yet.
4132 if (cur
->get_projected_inode()->size
||
4133 cur
->get_projected_inode()->truncate_seq
> 1) {
4134 respond_to_request(mdr
, -ENOTEMPTY
);
4139 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4140 // save existing layout for later
4141 const auto old_layout
= layout
;
4143 int access
= MAY_WRITE
;
// Overlay only the fields the client supplied (> 0) onto the current layout.
4145 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4146 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4147 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4148 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4149 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4150 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4151 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4152 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4154 // make sure we have as new a map as the client
4155 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4156 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4161 // Don't permit layout modifications without 'p' caps
4162 if (layout
!= old_layout
) {
4163 access
|= MAY_SET_VXATTR
;
// Validate the merged layout and its target data pool.
4166 if (!layout
.is_valid()) {
4167 dout(10) << "bad layout" << dendl
;
4168 respond_to_request(mdr
, -EINVAL
);
4171 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4172 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4173 respond_to_request(mdr
, -EINVAL
);
4177 xlocks
.insert(&cur
->filelock
);
4178 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4181 if (!check_access(mdr
, cur
, access
))
// Project the new layout (remembering the old pool) and journal it.
4185 inode_t
*pi
= cur
->project_inode();
4186 pi
->layout
= layout
;
4187 // add the old pool to the inode
4188 pi
->add_old_pool(old_layout
.pool_id
);
4189 pi
->version
= cur
->pre_dirty();
4190 pi
->ctime
= mdr
->get_op_stamp();
4194 mdr
->ls
= mdlog
->get_current_segment();
4195 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4196 mdlog
->start_entry(le
);
4197 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4198 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4199 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4201 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
// NOTE(review): lossily re-wrapped extraction; code left byte-identical,
// comments only.
//
// Handle a client set-directory-layout: xlock the dir's policylock, start
// from the dir's own/inherited/default layout, overlay the requested fields,
// validate, and journal the new default layout for future files.
4204 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
4206 MClientRequest
*req
= mdr
->client_request
;
4207 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4208 file_layout_t
*dir_layout
= NULL
;
4209 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4212 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4213 respond_to_request(mdr
, -EROFS
);
4217 if (!cur
->is_dir()) {
4218 respond_to_request(mdr
, -ENOTDIR
);
4222 xlocks
.insert(&cur
->policylock
);
4223 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
// Base layout: dir's own layout if set, else the inherited ancestor layout,
// else the filesystem default.
4227 const inode_t
*old_pi
= cur
->get_projected_inode();
4228 file_layout_t layout
;
4229 if (old_pi
->has_layout())
4230 layout
= old_pi
->layout
;
4231 else if (dir_layout
)
4232 layout
= *dir_layout
;
4234 layout
= mdcache
->default_file_layout
;
4236 // Level of access required to complete
4237 int access
= MAY_WRITE
;
4239 const auto old_layout
= layout
;
// Overlay only the client-supplied (> 0) fields.
4241 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4242 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4243 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4244 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4245 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4246 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4247 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4248 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4249 // make sure we have as new a map as the client
4250 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4251 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
// A layout change requires vxattr-set rights, mirroring setlayout above.
4256 if (layout
!= old_layout
) {
4257 access
|= MAY_SET_VXATTR
;
4260 if (!layout
.is_valid()) {
4261 dout(10) << "bad layout" << dendl
;
4262 respond_to_request(mdr
, -EINVAL
);
4265 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4266 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4267 respond_to_request(mdr
, -EINVAL
);
4271 if (!check_access(mdr
, cur
, access
))
// Project the directory's new layout and journal it as an EUpdate.
4274 inode_t
*pi
= cur
->project_inode();
4275 pi
->layout
= layout
;
4276 pi
->version
= cur
->pre_dirty();
4279 mdr
->ls
= mdlog
->get_current_segment();
4280 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4281 mdlog
->start_entry(le
);
4282 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4283 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4284 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4286 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4291 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
4292 file_layout_t
*layout
, bool validate
)
4294 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4296 if (name
== "layout") {
4297 string::iterator begin
= value
.begin();
4298 string::iterator end
= value
.end();
4299 keys_and_values
<string::iterator
> p
; // create instance of parser
4300 std::map
<string
, string
> m
; // map to receive results
4301 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4304 string
left(begin
, end
);
4305 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4308 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4309 // Skip validation on each attr, we do it once at the end (avoid
4310 // rejecting intermediate states if the overall result is ok)
4311 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
4312 osdmap
, layout
, false);
4316 } else if (name
== "layout.object_size") {
4317 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
4318 } else if (name
== "layout.stripe_unit") {
4319 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
4320 } else if (name
== "layout.stripe_count") {
4321 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
4322 } else if (name
== "layout.pool") {
4324 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
4325 } catch (boost::bad_lexical_cast
const&) {
4326 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
4328 dout(10) << " unknown pool " << value
<< dendl
;
4331 layout
->pool_id
= pool
;
4333 } else if (name
== "layout.pool_namespace") {
4334 layout
->pool_ns
= value
;
4336 dout(10) << " unknown layout vxattr " << name
<< dendl
;
4339 } catch (boost::bad_lexical_cast
const&) {
4340 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4344 if (validate
&& !layout
->is_valid()) {
4345 dout(10) << "bad layout" << dendl
;
4348 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
4349 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
4355 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
4357 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4359 if (name
== "quota") {
4360 string::iterator begin
= value
.begin();
4361 string::iterator end
= value
.end();
4362 keys_and_values
<string::iterator
> p
; // create instance of parser
4363 std::map
<string
, string
> m
; // map to receive results
4364 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4367 string
left(begin
, end
);
4368 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4371 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4372 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
4376 } else if (name
== "quota.max_bytes") {
4377 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4380 quota
->max_bytes
= q
;
4381 } else if (name
== "quota.max_files") {
4382 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4385 quota
->max_files
= q
;
4387 dout(10) << " unknown quota vxattr " << name
<< dendl
;
4390 } catch (boost::bad_lexical_cast
const&) {
4391 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4395 if (!quota
->is_valid()) {
4396 dout(10) << "bad quota" << dendl
;
4403 * Verify that the file layout attribute carried by client
4404 * is well-formatted.
4405 * Return 0 on success, otherwise this function takes
4406 * responsibility for the passed mdr.
4408 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
4411 file_layout_t
*layout
)
4413 MClientRequest
*req
= mdr
->client_request
;
4417 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4418 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4419 epoch
= osdmap
.get_epoch();
4424 // we don't have the specified pool, make sure our map
4425 // is newer than or as new as the client.
4426 epoch_t req_epoch
= req
->get_osdmap_epoch();
4428 if (req_epoch
> epoch
) {
4430 // well, our map is older. consult mds.
4431 Context
*fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
4433 if (!mds
->objecter
->wait_for_map(req_epoch
, fin
))
4434 return r
; // wait, fin will retry this request later
4438 // now we have at least as new a map as the client, try again.
4439 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4440 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4441 epoch
= osdmap
.get_epoch();
4444 assert(epoch
>= req_epoch
); // otherwise wait_for_map() told a lie
4446 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
4448 // For compatibility with client w/ old code, we still need get the
4449 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
4450 // we can remove those code.
4451 mdr
->waited_for_osdmap
= true;
4452 mds
->objecter
->wait_for_latest_osdmap(new C_IO_Wrapper(
4453 mds
, new C_MDS_RetryRequest(mdcache
, mdr
)));
4463 respond_to_request(mdr
, r
);
4471 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4472 file_layout_t
*dir_layout
,
4473 set
<SimpleLock
*> rdlocks
,
4474 set
<SimpleLock
*> wrlocks
,
4475 set
<SimpleLock
*> xlocks
)
4477 MClientRequest
*req
= mdr
->client_request
;
4478 string
name(req
->get_path2());
4479 bufferlist bl
= req
->get_data();
4480 string
value (bl
.c_str(), bl
.length());
4481 dout(10) << "handle_set_vxattr " << name
4482 << " val " << value
.length()
4483 << " bytes on " << *cur
4489 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
4493 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
4494 if (!cur
->is_dir()) {
4495 respond_to_request(mdr
, -EINVAL
);
4499 file_layout_t layout
;
4500 if (cur
->get_projected_inode()->has_layout())
4501 layout
= cur
->get_projected_inode()->layout
;
4502 else if (dir_layout
)
4503 layout
= *dir_layout
;
4505 layout
= mdcache
->default_file_layout
;
4507 rest
= name
.substr(name
.find("layout"));
4508 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4511 xlocks
.insert(&cur
->policylock
);
4512 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4515 pi
= cur
->project_inode();
4516 pi
->layout
= layout
;
4517 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
4518 if (!cur
->is_file()) {
4519 respond_to_request(mdr
, -EINVAL
);
4522 if (cur
->get_projected_inode()->size
||
4523 cur
->get_projected_inode()->truncate_seq
> 1) {
4524 respond_to_request(mdr
, -ENOTEMPTY
);
4527 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4528 rest
= name
.substr(name
.find("layout"));
4529 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4532 xlocks
.insert(&cur
->filelock
);
4533 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4536 pi
= cur
->project_inode();
4537 int64_t old_pool
= pi
->layout
.pool_id
;
4538 pi
->add_old_pool(old_pool
);
4539 pi
->layout
= layout
;
4540 pi
->ctime
= mdr
->get_op_stamp();
4541 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
4542 if (!cur
->is_dir() || cur
->is_root()) {
4543 respond_to_request(mdr
, -EINVAL
);
4547 quota_info_t quota
= cur
->get_projected_inode()->quota
;
4549 rest
= name
.substr(name
.find("quota"));
4550 int r
= parse_quota_vxattr(rest
, value
, "a
);
4552 respond_to_request(mdr
, r
);
4556 xlocks
.insert(&cur
->policylock
);
4557 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4560 pi
= cur
->project_inode();
4562 } else if (name
.find("ceph.dir.pin") == 0) {
4563 if (!cur
->is_dir() || cur
->is_root()) {
4564 respond_to_request(mdr
, -EINVAL
);
4570 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
4571 if (rank
< 0) rank
= MDS_RANK_NONE
;
4572 } catch (boost::bad_lexical_cast
const&) {
4573 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4574 respond_to_request(mdr
, -EINVAL
);
4578 xlocks
.insert(&cur
->policylock
);
4579 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4582 pi
= cur
->project_inode();
4583 cur
->set_export_pin(rank
);
4585 dout(10) << " unknown vxattr " << name
<< dendl
;
4586 respond_to_request(mdr
, -EINVAL
);
4591 pi
->ctime
= mdr
->get_op_stamp();
4592 pi
->version
= cur
->pre_dirty();
4594 pi
->update_backtrace();
4597 mdr
->ls
= mdlog
->get_current_segment();
4598 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
4599 mdlog
->start_entry(le
);
4600 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4601 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4602 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4604 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4608 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4609 file_layout_t
*dir_layout
,
4610 set
<SimpleLock
*> rdlocks
,
4611 set
<SimpleLock
*> wrlocks
,
4612 set
<SimpleLock
*> xlocks
)
4614 MClientRequest
*req
= mdr
->client_request
;
4615 string
name(req
->get_path2());
4617 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
4619 if (name
== "ceph.dir.layout") {
4620 if (!cur
->is_dir()) {
4621 respond_to_request(mdr
, -ENODATA
);
4624 if (cur
->is_root()) {
4625 dout(10) << "can't remove layout policy on the root directory" << dendl
;
4626 respond_to_request(mdr
, -EINVAL
);
4630 if (!cur
->get_projected_inode()->has_layout()) {
4631 respond_to_request(mdr
, -ENODATA
);
4635 xlocks
.insert(&cur
->policylock
);
4636 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4639 inode_t
*pi
= cur
->project_inode();
4641 pi
->version
= cur
->pre_dirty();
4644 mdr
->ls
= mdlog
->get_current_segment();
4645 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
4646 mdlog
->start_entry(le
);
4647 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4648 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4649 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4651 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4653 } else if (name
== "ceph.dir.layout.pool_namespace"
4654 || name
== "ceph.file.layout.pool_namespace") {
4655 // Namespace is the only layout field that has a meaningful
4656 // null/none value (empty string, means default layout). Is equivalent
4657 // to a setxattr with empty string: pass through the empty payload of
4658 // the rmxattr request to do this.
4659 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4663 respond_to_request(mdr
, -ENODATA
);
4666 class C_MDS_inode_xattr_update_finish
: public ServerLogContext
{
4670 C_MDS_inode_xattr_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
) :
4671 ServerLogContext(s
, r
), in(i
) { }
4672 void finish(int r
) override
{
4676 in
->pop_and_dirty_projected_inode(mdr
->ls
);
4680 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), in
, META_POP_IWR
);
4682 server
->respond_to_request(mdr
, 0);
4686 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
4688 MClientRequest
*req
= mdr
->client_request
;
4689 string
name(req
->get_path2());
4690 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4693 file_layout_t
*dir_layout
= NULL
;
4694 if (name
.compare(0, 15, "ceph.dir.layout") == 0)
4695 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4697 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4701 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4702 respond_to_request(mdr
, -EROFS
);
4706 int flags
= req
->head
.args
.setxattr
.flags
;
4708 // magic ceph.* namespace?
4709 if (name
.compare(0, 5, "ceph.") == 0) {
4710 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4714 xlocks
.insert(&cur
->xattrlock
);
4715 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4718 if (!check_access(mdr
, cur
, MAY_WRITE
))
4721 map
<string
, bufferptr
> *pxattrs
= cur
->get_projected_xattrs();
4722 size_t len
= req
->get_data().length();
4723 size_t inc
= len
+ name
.length();
4725 // check xattrs kv pairs size
4726 size_t cur_xattrs_size
= 0;
4727 for (const auto& p
: *pxattrs
) {
4728 if ((flags
& CEPH_XATTR_REPLACE
) && (name
.compare(p
.first
) == 0)) {
4731 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
4734 if (((cur_xattrs_size
+ inc
) > g_conf
->mds_max_xattr_pairs_size
)) {
4735 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4736 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
4737 respond_to_request(mdr
, -ENOSPC
);
4741 if ((flags
& CEPH_XATTR_CREATE
) && pxattrs
->count(name
)) {
4742 dout(10) << "setxattr '" << name
<< "' XATTR_CREATE and EEXIST on " << *cur
<< dendl
;
4743 respond_to_request(mdr
, -EEXIST
);
4746 if ((flags
& CEPH_XATTR_REPLACE
) && !pxattrs
->count(name
)) {
4747 dout(10) << "setxattr '" << name
<< "' XATTR_REPLACE and ENODATA on " << *cur
<< dendl
;
4748 respond_to_request(mdr
, -ENODATA
);
4752 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
4755 map
<string
,bufferptr
> *px
= new map
<string
,bufferptr
>;
4756 inode_t
*pi
= cur
->project_inode(px
);
4757 pi
->version
= cur
->pre_dirty();
4758 pi
->ctime
= mdr
->get_op_stamp();
4760 pi
->xattr_version
++;
4762 if (!(flags
& CEPH_XATTR_REMOVE
)) {
4763 (*px
)[name
] = buffer::create(len
);
4765 req
->get_data().copy(0, len
, (*px
)[name
].c_str());
4769 mdr
->ls
= mdlog
->get_current_segment();
4770 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
4771 mdlog
->start_entry(le
);
4772 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4773 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4774 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4776 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4779 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
4781 MClientRequest
*req
= mdr
->client_request
;
4782 string
name(req
->get_path2());
4783 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4784 file_layout_t
*dir_layout
= NULL
;
4786 if (name
== "ceph.dir.layout")
4787 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4789 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4793 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4794 respond_to_request(mdr
, -EROFS
);
4798 if (name
.compare(0, 5, "ceph.") == 0) {
4799 handle_remove_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4803 xlocks
.insert(&cur
->xattrlock
);
4804 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4807 map
<string
, bufferptr
> *pxattrs
= cur
->get_projected_xattrs();
4808 if (pxattrs
->count(name
) == 0) {
4809 dout(10) << "removexattr '" << name
<< "' and ENODATA on " << *cur
<< dendl
;
4810 respond_to_request(mdr
, -ENODATA
);
4814 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
4817 map
<string
,bufferptr
> *px
= new map
<string
,bufferptr
>;
4818 inode_t
*pi
= cur
->project_inode(px
);
4819 pi
->version
= cur
->pre_dirty();
4820 pi
->ctime
= mdr
->get_op_stamp();
4822 pi
->xattr_version
++;
4826 mdr
->ls
= mdlog
->get_current_segment();
4827 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
4828 mdlog
->start_entry(le
);
4829 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4830 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4831 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4833 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4837 // =================================================================
4838 // DIRECTORY and NAMESPACE OPS
4841 // ------------------------------------------------
4845 class C_MDS_mknod_finish
: public ServerLogContext
{
4849 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
4850 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
4851 void finish(int r
) override
{
4855 dn
->pop_projected_linkage();
4857 // be a bit hacky with the inode version, here.. we decrement it
4858 // just to keep mark_dirty() happen. (we didn't bother projecting
4859 // a new version of hte inode since it's just been created)
4860 newi
->inode
.version
--;
4861 newi
->mark_dirty(newi
->inode
.version
+ 1, mdr
->ls
);
4862 newi
->_mark_dirty_parent(mdr
->ls
, true);
4865 if (newi
->inode
.is_dir()) {
4866 CDir
*dir
= newi
->get_dirfrag(frag_t());
4868 dir
->fnode
.version
--;
4869 dir
->mark_dirty(dir
->fnode
.version
+ 1, mdr
->ls
);
4870 dir
->mark_new(mdr
->ls
);
4875 MDRequestRef null_ref
;
4876 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
4878 if (newi
->inode
.is_file())
4879 get_mds()->locker
->share_inode_max_size(newi
);
4882 get_mds()->balancer
->hit_inode(mdr
->get_mds_stamp(), newi
, META_POP_IWR
);
4885 server
->respond_to_request(mdr
, 0);
4890 void Server::handle_client_mknod(MDRequestRef
& mdr
)
4892 MClientRequest
*req
= mdr
->client_request
;
4893 client_t client
= mdr
->get_client();
4894 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4895 file_layout_t
*dir_layout
= NULL
;
4896 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false,
4899 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4900 respond_to_request(mdr
, -EROFS
);
4903 CInode
*diri
= dn
->get_dir()->get_inode();
4904 rdlocks
.insert(&diri
->authlock
);
4905 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4908 if (!check_access(mdr
, diri
, MAY_WRITE
))
4911 if (!check_fragment_space(mdr
, dn
->get_dir()))
4914 unsigned mode
= req
->head
.args
.mknod
.mode
;
4915 if ((mode
& S_IFMT
) == 0)
4919 file_layout_t layout
;
4920 if (dir_layout
&& S_ISREG(mode
))
4921 layout
= *dir_layout
;
4923 layout
= mdcache
->default_file_layout
;
4925 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
4926 snapid_t follows
= realm
->get_newest_seq();
4927 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
4931 dn
->push_projected_linkage(newi
);
4933 newi
->inode
.rdev
= req
->head
.args
.mknod
.rdev
;
4934 newi
->inode
.version
= dn
->pre_dirty();
4935 newi
->inode
.rstat
.rfiles
= 1;
4936 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
4937 newi
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
4938 newi
->inode
.update_backtrace();
4940 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
4941 // want to write to it (e.g., if they are reexporting NFS)
4942 if (S_ISREG(newi
->inode
.mode
)) {
4943 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
4944 newi
->inode
.client_ranges
[client
].range
.first
= 0;
4945 newi
->inode
.client_ranges
[client
].range
.last
= newi
->inode
.get_layout_size_increment();
4946 newi
->inode
.client_ranges
[client
].follows
= follows
;
4948 // issue a cap on the file
4949 int cmode
= CEPH_FILE_MODE_RDWR
;
4950 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
4954 // put locks in excl mode
4955 newi
->filelock
.set_state(LOCK_EXCL
);
4956 newi
->authlock
.set_state(LOCK_EXCL
);
4957 newi
->xattrlock
.set_state(LOCK_EXCL
);
4961 assert(dn
->first
== follows
+ 1);
4962 newi
->first
= dn
->first
;
4964 dout(10) << "mknod mode " << newi
->inode
.mode
<< " rdev " << newi
->inode
.rdev
<< dendl
;
4967 mdr
->ls
= mdlog
->get_current_segment();
4968 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
4969 mdlog
->start_entry(le
);
4970 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4971 journal_allocated_inos(mdr
, &le
->metablob
);
4973 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
4974 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
4975 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
4977 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
4983 /* This function takes responsibility for the passed mdr*/
4984 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
4986 MClientRequest
*req
= mdr
->client_request
;
4987 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4988 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
4990 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4991 respond_to_request(mdr
, -EROFS
);
4994 CDir
*dir
= dn
->get_dir();
4995 CInode
*diri
= dir
->get_inode();
4996 rdlocks
.insert(&diri
->authlock
);
4997 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5000 // mkdir check access
5001 if (!check_access(mdr
, diri
, MAY_WRITE
))
5004 if (!check_fragment_space(mdr
, dir
))
5008 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5009 snapid_t follows
= realm
->get_newest_seq();
5011 unsigned mode
= req
->head
.args
.mkdir
.mode
;
5014 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5017 // it's a directory.
5018 dn
->push_projected_linkage(newi
);
5020 newi
->inode
.version
= dn
->pre_dirty();
5021 newi
->inode
.rstat
.rsubdirs
= 1;
5022 newi
->inode
.update_backtrace();
5024 dout(12) << " follows " << follows
<< dendl
;
5025 assert(dn
->first
== follows
+ 1);
5026 newi
->first
= dn
->first
;
5028 // ...and that new dir is empty.
5029 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
5030 newdir
->state_set(CDir::STATE_CREATING
);
5031 newdir
->mark_complete();
5032 newdir
->fnode
.version
= newdir
->pre_dirty();
5035 mdr
->ls
= mdlog
->get_current_segment();
5036 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
5037 mdlog
->start_entry(le
);
5038 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5039 journal_allocated_inos(mdr
, &le
->metablob
);
5040 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5041 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5042 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
5044 // issue a cap on the directory
5045 int cmode
= CEPH_FILE_MODE_RDWR
;
5046 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5050 // put locks in excl mode
5051 newi
->filelock
.set_state(LOCK_EXCL
);
5052 newi
->authlock
.set_state(LOCK_EXCL
);
5053 newi
->xattrlock
.set_state(LOCK_EXCL
);
5056 // make sure this inode gets into the journal
5057 le
->metablob
.add_opened_ino(newi
->ino());
5058 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
5059 ls
->open_files
.push_back(&newi
->item_open_file
);
5061 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5067 void Server::handle_client_symlink(MDRequestRef
& mdr
)
5069 MClientRequest
*req
= mdr
->client_request
;
5070 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5071 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5073 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5074 respond_to_request(mdr
, -EROFS
);
5077 CDir
*dir
= dn
->get_dir();
5078 CInode
*diri
= dir
->get_inode();
5079 rdlocks
.insert(&diri
->authlock
);
5080 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5083 if (!check_access(mdr
, diri
, MAY_WRITE
))
5086 if (!check_fragment_space(mdr
, dir
))
5089 unsigned mode
= S_IFLNK
| 0777;
5090 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5094 dn
->push_projected_linkage(newi
);
5096 newi
->symlink
= req
->get_path2();
5097 newi
->inode
.size
= newi
->symlink
.length();
5098 newi
->inode
.rstat
.rbytes
= newi
->inode
.size
;
5099 newi
->inode
.rstat
.rfiles
= 1;
5100 newi
->inode
.version
= dn
->pre_dirty();
5101 newi
->inode
.update_backtrace();
5103 newi
->first
= dn
->first
;
5106 mdr
->ls
= mdlog
->get_current_segment();
5107 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
5108 mdlog
->start_entry(le
);
5109 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5110 journal_allocated_inos(mdr
, &le
->metablob
);
5111 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5112 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5114 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5123 void Server::handle_client_link(MDRequestRef
& mdr
)
5125 MClientRequest
*req
= mdr
->client_request
;
5127 dout(7) << "handle_client_link " << req
->get_filepath()
5128 << " to " << req
->get_filepath2()
5131 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5133 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5135 CInode
*targeti
= rdlock_path_pin_ref(mdr
, 1, rdlocks
, false);
5136 if (!targeti
) return;
5137 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5138 respond_to_request(mdr
, -EROFS
);
5142 CDir
*dir
= dn
->get_dir();
5143 dout(7) << "handle_client_link link " << dn
->get_name() << " in " << *dir
<< dendl
;
5144 dout(7) << "target is " << *targeti
<< dendl
;
5145 if (targeti
->is_dir()) {
5146 dout(7) << "target is a dir, failing..." << dendl
;
5147 respond_to_request(mdr
, -EINVAL
);
5151 xlocks
.insert(&targeti
->linklock
);
5153 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5156 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5157 if (!check_access(mdr
, targeti
, MAY_WRITE
))
5160 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
5163 if (!check_fragment_space(mdr
, dir
))
5168 assert(g_conf
->mds_kill_link_at
!= 1);
5171 if (targeti
->is_auth())
5172 _link_local(mdr
, dn
, targeti
);
5174 _link_remote(mdr
, true, dn
, targeti
);
5178 class C_MDS_link_local_finish
: public ServerLogContext
{
5184 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
5185 version_t dnpv_
, version_t tipv_
) :
5186 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
5187 dnpv(dnpv_
), tipv(tipv_
) { }
5188 void finish(int r
) override
{
5190 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
);
5195 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
5197 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
5199 mdr
->ls
= mdlog
->get_current_segment();
5201 // predirty NEW dentry
5202 version_t dnpv
= dn
->pre_dirty();
5203 version_t tipv
= targeti
->pre_dirty();
5205 // project inode update
5206 inode_t
*pi
= targeti
->project_inode();
5208 pi
->ctime
= mdr
->get_op_stamp();
5213 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
5214 mdlog
->start_entry(le
);
5215 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5216 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
5217 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
5218 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5219 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
5221 // do this after predirty_*, to avoid funky extra dnl arg
5222 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5224 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
));
5227 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
5228 version_t dnpv
, version_t tipv
)
5230 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
5232 // link and unlock the NEW dentry
5233 dn
->pop_projected_linkage();
5234 dn
->mark_dirty(dnpv
, mdr
->ls
);
5237 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5241 MDRequestRef null_ref
;
5242 mdcache
->send_dentry_link(dn
, null_ref
);
5244 // bump target popularity
5245 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), targeti
, META_POP_IWR
);
5246 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), dn
->get_dir(), META_POP_IWR
);
5249 respond_to_request(mdr
, 0);
5253 // link / unlink remote
5255 class C_MDS_link_remote_finish
: public ServerLogContext
{
5261 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
5262 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
5263 dpv(d
->get_projected_version()) {}
5264 void finish(int r
) override
{
5266 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
5270 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
5272 dout(10) << "_link_remote "
5273 << (inc
? "link ":"unlink ")
5274 << *dn
<< " to " << *targeti
<< dendl
;
5276 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5277 mds_rank_t linkauth
= targeti
->authority().first
;
5278 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
5279 if (mds
->is_cluster_degraded() &&
5280 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
5281 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
5282 if (mdr
->more()->waiting_on_slave
.empty())
5283 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
5287 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
5290 op
= MMDSSlaveRequest::OP_LINKPREP
;
5292 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
5293 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, op
);
5294 targeti
->set_object_info(req
->get_object_info());
5295 req
->op_stamp
= mdr
->get_op_stamp();
5296 mds
->send_message_mds(req
, linkauth
);
5298 assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
5299 mdr
->more()->waiting_on_slave
.insert(linkauth
);
5302 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
5304 assert(g_conf
->mds_kill_link_at
!= 2);
5306 mdr
->set_mds_stamp(ceph_clock_now());
5309 mdr
->ls
= mdlog
->get_current_segment();
5310 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
5311 mdlog
->start_entry(le
);
5312 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5313 if (!mdr
->more()->witnessed
.empty()) {
5314 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
5315 le
->reqid
= mdr
->reqid
;
5316 le
->had_slaves
= true;
5317 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
5322 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
5323 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5324 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5327 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
5328 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
5329 le
->metablob
.add_null_dentry(dn
, true);
5332 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
5335 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
5336 CDentry
*dn
, CInode
*targeti
,
5339 dout(10) << "_link_remote_finish "
5340 << (inc
? "link ":"unlink ")
5341 << *dn
<< " to " << *targeti
<< dendl
;
5343 assert(g_conf
->mds_kill_link_at
!= 3);
5345 if (!mdr
->more()->witnessed
.empty())
5346 mdcache
->logged_master_update(mdr
->reqid
);
5349 // link the new dentry
5350 dn
->pop_projected_linkage();
5351 dn
->mark_dirty(dpv
, mdr
->ls
);
5353 // unlink main dentry
5354 dn
->get_dir()->unlink_inode(dn
);
5355 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
5360 MDRequestRef null_ref
;
5362 mdcache
->send_dentry_link(dn
, null_ref
);
5364 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
5366 // bump target popularity
5367 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), targeti
, META_POP_IWR
);
5368 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), dn
->get_dir(), META_POP_IWR
);
5371 respond_to_request(mdr
, 0);
5374 // removing a new dn?
5375 dn
->get_dir()->try_remove_unlinked_dn(dn
);
5379 // remote linking/unlinking
5381 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
5384 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5385 ServerLogContext(s
, r
), targeti(t
) { }
5386 void finish(int r
) override
{
5388 server
->_logged_slave_link(mdr
, targeti
);
5392 class C_MDS_SlaveLinkCommit
: public ServerContext
{
5396 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5397 ServerContext(s
), mdr(r
), targeti(t
) { }
5398 void finish(int r
) override
{
5399 server
->_commit_slave_link(mdr
, r
, targeti
);
5403 /* This function DOES put the mdr->slave_request before returning*/
5404 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
5406 dout(10) << "handle_slave_link_prep " << *mdr
5407 << " on " << mdr
->slave_request
->get_object_info()
5410 assert(g_conf
->mds_kill_link_at
!= 4);
5412 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
5414 dout(10) << "targeti " << *targeti
<< dendl
;
5415 CDentry
*dn
= targeti
->get_parent_dn();
5416 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5417 assert(dnl
->is_primary());
5419 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
5421 mdr
->auth_pin(targeti
);
5423 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5424 assert(g_conf
->mds_kill_link_at
!= 5);
5427 mdr
->ls
= mdlog
->get_current_segment();
5428 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
5429 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
5430 mdlog
->start_entry(le
);
5432 inode_t
*pi
= dnl
->get_inode()->project_inode();
5434 // update journaled target inode
5436 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
5444 link_rollback rollback
;
5445 rollback
.reqid
= mdr
->reqid
;
5446 rollback
.ino
= targeti
->ino();
5447 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
5448 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
5449 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
5450 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
5451 rollback
.was_inc
= inc
;
5452 ::encode(rollback
, le
->rollback
);
5453 mdr
->more()->rollback_bl
= le
->rollback
;
5455 pi
->ctime
= mdr
->get_op_stamp();
5456 pi
->version
= targeti
->pre_dirty();
5458 dout(10) << " projected inode " << pi
<< " v " << pi
->version
<< dendl
;
5461 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
5462 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
5464 // set up commit waiter
5465 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
5467 mdr
->more()->slave_update_journaled
= true;
5468 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
),
5473 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
)
5475 dout(10) << "_logged_slave_link " << *mdr
5476 << " " << *targeti
<< dendl
;
5478 assert(g_conf
->mds_kill_link_at
!= 6);
5480 // update the target
5481 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5485 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), targeti
, META_POP_IWR
);
5488 mdr
->slave_request
->put();
5489 mdr
->slave_request
= 0;
5492 if (!mdr
->aborted
) {
5493 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5494 MMDSSlaveRequest::OP_LINKPREPACK
);
5495 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
5497 dout(10) << " abort flag set, finishing" << dendl
;
5498 mdcache
->request_finish(mdr
);
5503 struct C_MDS_CommittedSlave
: public ServerLogContext
{
5504 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
5505 void finish(int r
) override
{
5506 server
->_committed_slave(mdr
);
5510 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
5512 dout(10) << "_commit_slave_link " << *mdr
5514 << " " << *targeti
<< dendl
;
5516 assert(g_conf
->mds_kill_link_at
!= 7);
5519 // drop our pins, etc.
5522 // write a commit to the journal
5523 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
5524 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
5525 mdlog
->start_entry(le
);
5526 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
5529 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
5533 void Server::_committed_slave(MDRequestRef
& mdr
)
5535 dout(10) << "_committed_slave " << *mdr
<< dendl
;
5537 assert(g_conf
->mds_kill_link_at
!= 8);
5539 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5540 MMDSSlaveRequest::OP_COMMITTED
);
5541 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
5542 mdcache
->request_finish(mdr
);
5545 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
5547 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
) : ServerLogContext(s
, r
), mut(m
) {}
5548 void finish(int r
) override
{
5549 server
->_link_rollback_finish(mut
, mdr
);
5553 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
5555 link_rollback rollback
;
5556 bufferlist::iterator p
= rbl
.begin();
5557 ::decode(rollback
, p
);
5559 dout(10) << "do_link_rollback on " << rollback
.reqid
5560 << (rollback
.was_inc
? " inc":" dec")
5561 << " ino " << rollback
.ino
5564 assert(g_conf
->mds_kill_link_at
!= 9);
5566 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
5567 assert(mdr
|| mds
->is_resolve());
5569 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
5570 mut
->ls
= mds
->mdlog
->get_current_segment();
5572 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
5574 dout(10) << " target is " << *in
<< dendl
;
5575 assert(!in
->is_projected()); // live slave request hold versionlock xlock.
5577 inode_t
*pi
= in
->project_inode();
5578 pi
->version
= in
->pre_dirty();
5579 mut
->add_projected_inode(in
);
5581 // parent dir rctime
5582 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
5583 fnode_t
*pf
= parent
->project_fnode();
5584 mut
->add_projected_fnode(parent
);
5585 pf
->version
= parent
->pre_dirty();
5586 if (pf
->fragstat
.mtime
== pi
->ctime
) {
5587 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
5588 if (pf
->rstat
.rctime
== pi
->ctime
)
5589 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
5590 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
5591 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
5595 pi
->ctime
= rollback
.old_ctime
;
5596 if (rollback
.was_inc
)
5602 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
5603 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
5604 mdlog
->start_entry(le
);
5605 le
->commit
.add_dir_context(parent
);
5606 le
->commit
.add_dir(parent
, true);
5607 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
5609 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
),
5614 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
)
5616 dout(10) << "_link_rollback_finish" << dendl
;
5618 assert(g_conf
->mds_kill_link_at
!= 10);
5622 mdcache
->request_finish(mdr
);
5624 mdcache
->finish_rollback(mut
->reqid
);
5630 /* This function DOES NOT put the passed message before returning*/
5631 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*m
)
5633 dout(10) << "handle_slave_link_prep_ack " << *mdr
5634 << " " << *m
<< dendl
;
5635 mds_rank_t from
= mds_rank_t(m
->get_source().num());
5637 assert(g_conf
->mds_kill_link_at
!= 11);
5640 mdr
->more()->slaves
.insert(from
);
5643 assert(mdr
->more()->witnessed
.count(from
) == 0);
5644 mdr
->more()->witnessed
.insert(from
);
5645 assert(!m
->is_not_journaled());
5646 mdr
->more()->has_journaled_slaves
= true;
5648 // remove from waiting list
5649 assert(mdr
->more()->waiting_on_slave
.count(from
));
5650 mdr
->more()->waiting_on_slave
.erase(from
);
5652 assert(mdr
->more()->waiting_on_slave
.empty());
5654 dispatch_client_request(mdr
); // go again!
5663 void Server::handle_client_unlink(MDRequestRef
& mdr
)
5665 MClientRequest
*req
= mdr
->client_request
;
5666 client_t client
= mdr
->get_client();
5670 if (req
->get_op() == CEPH_MDS_OP_RMDIR
) rmdir
= true;
5672 if (req
->get_filepath().depth() == 0) {
5673 respond_to_request(mdr
, -EINVAL
);
5678 vector
<CDentry
*> trace
;
5680 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(), &trace
, &in
, MDS_TRAVERSE_FORWARD
);
5684 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
5685 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr
));
5688 respond_to_request(mdr
, r
);
5691 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5692 respond_to_request(mdr
, -EROFS
);
5696 CDentry
*dn
= trace
[trace
.size()-1];
5698 if (!dn
->is_auth()) {
5699 mdcache
->request_forward(mdr
, dn
->authority().first
);
5703 CInode
*diri
= dn
->get_dir()->get_inode();
5705 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
5706 assert(!dnl
->is_null());
5709 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
5711 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
5713 dout(7) << "dn links to " << *in
<< dendl
;
5718 // do empty directory checks
5719 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
5720 respond_to_request(mdr
, -ENOTEMPTY
);
5724 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
5725 respond_to_request(mdr
, -EISDIR
);
5731 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
5732 respond_to_request(mdr
, -ENOTDIR
);
5737 // -- create stray dentry? --
5738 CDentry
*straydn
= NULL
;
5739 if (dnl
->is_primary()) {
5740 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
5743 dout(10) << " straydn is " << *straydn
<< dendl
;
5744 } else if (mdr
->straydn
) {
5745 mdr
->unpin(mdr
->straydn
);
5746 mdr
->straydn
= NULL
;
5750 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5752 for (int i
=0; i
<(int)trace
.size()-1; i
++)
5753 rdlocks
.insert(&trace
[i
]->lock
);
5754 xlocks
.insert(&dn
->lock
);
5755 wrlocks
.insert(&diri
->filelock
);
5756 wrlocks
.insert(&diri
->nestlock
);
5757 xlocks
.insert(&in
->linklock
);
5759 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
5760 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
5761 xlocks
.insert(&straydn
->lock
);
5764 rdlocks
.insert(&in
->filelock
); // to verify it's empty
5765 mds
->locker
->include_snap_rdlocks(rdlocks
, dnl
->get_inode());
5767 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5771 _dir_is_nonempty(mdr
, in
)) {
5772 respond_to_request(mdr
, -ENOTEMPTY
);
5776 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5777 if (!check_access(mdr
, diri
, MAY_WRITE
))
5782 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
5783 // subtree root auths need to be witnesses
5784 set
<mds_rank_t
> witnesses
;
5785 in
->list_replicas(witnesses
);
5786 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
5788 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
5789 p
!= witnesses
.end();
5791 if (mdr
->more()->witnessed
.count(*p
)) {
5792 dout(10) << " already witnessed by mds." << *p
<< dendl
;
5793 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
5794 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
5796 if (!_rmdir_prepare_witness(mdr
, *p
, trace
, straydn
))
5800 if (!mdr
->more()->waiting_on_slave
.empty())
5801 return; // we're waiting for a witness.
5805 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
5806 _link_remote(mdr
, false, dn
, dnl
->get_inode());
5808 _unlink_local(mdr
, dn
, straydn
);
5811 class C_MDS_unlink_local_finish
: public ServerLogContext
{
5814 version_t dnpv
; // deleted dentry
5816 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
5817 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
5818 dnpv(d
->get_projected_version()) {}
5819 void finish(int r
) override
{
5821 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
5825 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
5827 dout(10) << "_unlink_local " << *dn
<< dendl
;
5829 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
5830 CInode
*in
= dnl
->get_inode();
5832 SnapRealm
*realm
= in
->find_snaprealm();
5833 snapid_t follows
= realm
->get_newest_seq();
5836 mdr
->ls
= mdlog
->get_current_segment();
5838 // prepare log entry
5839 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
5840 mdlog
->start_entry(le
);
5841 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5842 if (!mdr
->more()->witnessed
.empty()) {
5843 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
5844 le
->reqid
= mdr
->reqid
;
5845 le
->had_slaves
= true;
5846 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
5850 assert(dnl
->is_primary());
5851 straydn
->push_projected_linkage(in
);
5852 straydn
->first
= follows
+ 1;
5855 // the unlinked dentry
5858 inode_t
*pi
= in
->project_inode();
5859 dn
->make_path_string(pi
->stray_prior_path
);
5860 mdr
->add_projected_inode(in
); // do this _after_ my dn->pre_dirty().. we apply that one manually.
5861 pi
->version
= in
->pre_dirty();
5862 pi
->ctime
= mdr
->get_op_stamp();
5866 in
->state_set(CInode::STATE_ORPHAN
);
5868 if (dnl
->is_primary()) {
5869 // primary link. add stray dentry.
5871 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
5872 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5874 // project snaprealm, too
5875 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap())
5876 in
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
5878 pi
->update_backtrace();
5879 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
5881 // remote link. update remote inode.
5882 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
5883 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
5884 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
5887 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
5888 le
->metablob
.add_null_dentry(dn
, true);
5891 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
5892 le
->metablob
.renamed_dirino
= in
->ino();
5895 dn
->push_projected_linkage();
5899 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
5902 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
5905 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
5906 CDentry
*dn
, CDentry
*straydn
,
5909 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
5911 if (!mdr
->more()->witnessed
.empty())
5912 mdcache
->logged_master_update(mdr
->reqid
);
5914 // unlink main dentry
5915 dn
->get_dir()->unlink_inode(dn
);
5916 dn
->pop_projected_linkage();
5918 // relink as stray? (i.e. was primary link?)
5919 CInode
*strayin
= NULL
;
5920 bool snap_is_new
= false;
5922 dout(20) << " straydn is " << *straydn
<< dendl
;
5923 CDentry::linkage_t
*straydnl
= straydn
->pop_projected_linkage();
5924 strayin
= straydnl
->get_inode();
5926 snap_is_new
= strayin
->snaprealm
? true : false;
5927 mdcache
->touch_dentry_bottom(straydn
);
5930 dn
->mark_dirty(dnpv
, mdr
->ls
);
5933 if (snap_is_new
) //only new if strayin exists
5934 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, true);
5936 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
5938 // update subtree map?
5939 if (straydn
&& strayin
->is_dir())
5940 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
5943 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), dn
->get_dir(), META_POP_IWR
);
5946 respond_to_request(mdr
, 0);
5948 // removing a new dn?
5949 dn
->get_dir()->try_remove_unlinked_dn(dn
);
5952 // respond_to_request() drops locks. So stray reintegration can race with us.
5953 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
5954 // Tip off the MDCache that this dentry is a stray that
5955 // might be elegible for purge.
5956 mdcache
->notify_stray(straydn
);
5960 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
5962 if (mds
->is_cluster_degraded() &&
5963 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
5964 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
5965 if (mdr
->more()->waiting_on_slave
.empty())
5966 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
5970 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
5971 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5972 MMDSSlaveRequest::OP_RMDIRPREP
);
5973 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
5974 for (auto dn
: trace
)
5975 req
->srcdnpath
.push_dentry(dn
->name
);
5976 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
5978 req
->op_stamp
= mdr
->get_op_stamp();
5979 mds
->send_message_mds(req
, who
);
5981 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
5982 mdr
->more()->waiting_on_slave
.insert(who
);
5986 struct C_MDS_SlaveRmdirPrep
: public ServerLogContext
{
5987 CDentry
*dn
, *straydn
;
5988 C_MDS_SlaveRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
5989 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
5990 void finish(int r
) override
{
5991 server
->_logged_slave_rmdir(mdr
, dn
, straydn
);
5995 struct C_MDS_SlaveRmdirCommit
: public ServerContext
{
5997 C_MDS_SlaveRmdirCommit(Server
*s
, MDRequestRef
& r
)
5998 : ServerContext(s
), mdr(r
) { }
5999 void finish(int r
) override
{
6000 server
->_commit_slave_rmdir(mdr
, r
);
6004 void Server::handle_slave_rmdir_prep(MDRequestRef
& mdr
)
6006 dout(10) << "handle_slave_rmdir_prep " << *mdr
6007 << " " << mdr
->slave_request
->srcdnpath
6008 << " to " << mdr
->slave_request
->destdnpath
6011 vector
<CDentry
*> trace
;
6012 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
6013 dout(10) << " src " << srcpath
<< dendl
;
6015 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &trace
, &in
, MDS_TRAVERSE_DISCOVERXLOCK
);
6018 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
6023 CDentry
*dn
= trace
[trace
.size()-1];
6024 dout(10) << " dn " << *dn
<< dendl
;
6027 assert(mdr
->straydn
);
6028 CDentry
*straydn
= mdr
->straydn
;
6029 dout(10) << " straydn " << *straydn
<< dendl
;
6031 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6033 rmdir_rollback rollback
;
6034 rollback
.reqid
= mdr
->reqid
;
6035 rollback
.src_dir
= dn
->get_dir()->dirfrag();
6036 rollback
.src_dname
= dn
->name
;
6037 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
6038 rollback
.dest_dname
= straydn
->name
;
6039 ::encode(rollback
, mdr
->more()->rollback_bl
);
6040 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
6042 // set up commit waiter
6043 mdr
->more()->slave_commit
= new C_MDS_SlaveRmdirCommit(this, mdr
);
6045 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
6046 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
6047 dn
->get_dir()->unlink_inode(dn
);
6048 straydn
->get_dir()->link_primary_inode(straydn
, in
);
6050 assert(straydn
->first
>= in
->first
);
6051 in
->first
= straydn
->first
;
6053 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), false);
6055 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6056 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6057 reply
->mark_not_journaled();
6058 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6060 // send caps to auth (if we're not already)
6061 if (in
->is_any_caps() && !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
6062 mdcache
->migrator
->export_caps(in
);
6064 mdcache
->touch_dentry_bottom(straydn
); // move stray to end of lru
6066 mdr
->slave_request
->put();
6067 mdr
->slave_request
= 0;
6072 straydn
->push_projected_linkage(in
);
6073 dn
->push_projected_linkage();
6075 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir", mdr
->reqid
, mdr
->slave_to_mds
,
6076 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RMDIR
);
6077 mdlog
->start_entry(le
);
6078 le
->rollback
= mdr
->more()->rollback_bl
;
6080 le
->commit
.add_dir_context(straydn
->get_dir());
6081 le
->commit
.add_primary_dentry(straydn
, in
, true);
6082 // slave: no need to journal original dentry
6084 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6085 le
->commit
.renamed_dirino
= in
->ino();
6087 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6089 mdr
->more()->slave_update_journaled
= true;
6090 submit_mdlog_entry(le
, new C_MDS_SlaveRmdirPrep(this, mdr
, dn
, straydn
),
6095 void Server::_logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6097 dout(10) << "_logged_slave_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
6099 // update our cache now, so we are consistent with what is in the journal
6100 // when we journal a subtree map
6101 CInode
*in
= dn
->get_linkage()->get_inode();
6102 dn
->get_dir()->unlink_inode(dn
);
6103 straydn
->pop_projected_linkage();
6104 dn
->pop_projected_linkage();
6105 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), true);
6108 mdr
->slave_request
->put();
6109 mdr
->slave_request
= 0;
6112 if (!mdr
->aborted
) {
6113 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6114 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6115 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6117 dout(10) << " abort flag set, finishing" << dendl
;
6118 mdcache
->request_finish(mdr
);
6122 void Server::handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
6124 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6125 << " " << *ack
<< dendl
;
6127 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
6129 mdr
->more()->slaves
.insert(from
);
6130 mdr
->more()->witnessed
.insert(from
);
6131 if (!ack
->is_not_journaled())
6132 mdr
->more()->has_journaled_slaves
= true;
6134 // remove from waiting list
6135 assert(mdr
->more()->waiting_on_slave
.count(from
));
6136 mdr
->more()->waiting_on_slave
.erase(from
);
6138 if (mdr
->more()->waiting_on_slave
.empty())
6139 dispatch_client_request(mdr
); // go again!
6141 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
6144 void Server::_commit_slave_rmdir(MDRequestRef
& mdr
, int r
)
6146 dout(10) << "_commit_slave_rmdir " << *mdr
<< " r=" << r
<< dendl
;
6151 if (mdr
->more()->slave_update_journaled
) {
6152 // write a commit to the journal
6153 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_commit", mdr
->reqid
,
6154 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
6155 ESlaveUpdate::RMDIR
);
6156 mdlog
->start_entry(le
);
6157 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
6160 _committed_slave(mdr
);
6164 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
6168 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
6172 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
6173 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
6174 void finish(int r
) override
{
6175 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
6179 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
6181 // unlink the other rollback methods, the rmdir rollback is only
6182 // needed to record the subtree changes in the journal for inode
6183 // replicas who are auth for empty dirfrags. no actual changes to
6184 // the file system are taking place here, so there is no Mutation.
6186 rmdir_rollback rollback
;
6187 bufferlist::iterator p
= rbl
.begin();
6188 ::decode(rollback
, p
);
6190 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
6191 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
6192 assert(mdr
|| mds
->is_resolve());
6194 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
6196 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
6198 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
6200 dout(10) << " dn " << *dn
<< dendl
;
6201 dir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
6203 CDentry
*straydn
= dir
->lookup(rollback
.dest_dname
);
6205 dout(10) << " straydn " << *dn
<< dendl
;
6206 CInode
*in
= straydn
->get_linkage()->get_inode();
6208 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
6209 assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
6211 straydn
->get_dir()->unlink_inode(straydn
);
6212 dn
->get_dir()->link_primary_inode(dn
, in
);
6214 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), false);
6216 mdcache
->request_finish(mdr
);
6217 mdcache
->finish_rollback(rollback
.reqid
);
6221 dn
->push_projected_linkage(in
);
6222 straydn
->push_projected_linkage();
6224 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_rollback", rollback
.reqid
, master
,
6225 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RMDIR
);
6226 mdlog
->start_entry(le
);
6228 le
->commit
.add_dir_context(dn
->get_dir());
6229 le
->commit
.add_primary_dentry(dn
, in
, true);
6230 // slave: no need to journal straydn
6232 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6233 le
->commit
.renamed_dirino
= in
->ino();
6235 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
6237 submit_mdlog_entry(le
,
6238 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
6244 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
6246 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
6248 straydn
->get_dir()->unlink_inode(straydn
);
6249 dn
->pop_projected_linkage();
6250 straydn
->pop_projected_linkage();
6252 CInode
*in
= dn
->get_linkage()->get_inode();
6253 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), true);
6254 if (mds
->is_resolve()) {
6255 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
6256 mdcache
->try_trim_non_auth_subtree(root
);
6260 mdcache
->request_finish(mdr
);
6262 mdcache
->finish_rollback(reqid
);
6266 /** _dir_is_nonempty[_unlocked]
6268 * check if a directory is non-empty (i.e. we can rmdir it).
6270 * the unlocked varient this is a fastpath check. we can't really be
6271 * sure until we rdlock the filelock.
6273 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
6275 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
6276 assert(in
->is_auth());
6278 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
6279 return true; // in a snapshot!
6282 in
->get_dirfrags(ls
);
6283 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6285 // is the frag obviously non-empty?
6286 if (dir
->is_auth()) {
6287 if (dir
->get_projected_fnode()->fragstat
.size()) {
6288 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6289 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
6298 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
6300 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
6301 assert(in
->is_auth());
6302 assert(in
->filelock
.can_read(mdr
->get_client()));
6304 frag_info_t dirstat
;
6305 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
6308 in
->get_dirfrags(ls
);
6309 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6311 const fnode_t
*pf
= dir
->get_projected_fnode();
6312 if (pf
->fragstat
.size()) {
6313 dout(10) << "dir_is_nonempty dirstat has "
6314 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
6318 if (pf
->accounted_fragstat
.version
== dirstat_version
)
6319 dirstat
.add(pf
->accounted_fragstat
);
6321 dirstat
.add(pf
->fragstat
);
6324 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
6328 // ======================================================
6331 class C_MDS_rename_finish
: public ServerLogContext
{
6336 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
6337 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
6338 ServerLogContext(s
, r
),
6339 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
6340 void finish(int r
) override
{
6342 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
6347 /** handle_client_rename
6349 * rename master is the destdn auth. this is because cached inodes
6350 * must remain connected. thus, any replica of srci, must also
6351 * replicate destdn, and possibly straydn, so that srci (and
6352 * destdn->inode) remain connected during the rename.
6354 * to do this, we freeze srci, then master (destdn auth) verifies that
6355 * all other nodes have also replciated destdn and straydn. note that
6356 * destdn replicas need not also replicate srci. this only works when
6359 * This function takes responsibility for the passed mdr.
6361 void Server::handle_client_rename(MDRequestRef
& mdr
)
6363 MClientRequest
*req
= mdr
->client_request
;
6364 dout(7) << "handle_client_rename " << *req
<< dendl
;
6366 filepath destpath
= req
->get_filepath();
6367 filepath srcpath
= req
->get_filepath2();
6368 if (destpath
.depth() == 0 || srcpath
.depth() == 0) {
6369 respond_to_request(mdr
, -EINVAL
);
6372 const string
&destname
= destpath
.last_dentry();
6374 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
6375 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
6377 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
6379 CDentry
*destdn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, true, false, true);
6380 if (!destdn
) return;
6381 dout(10) << " destdn " << *destdn
<< dendl
;
6382 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6383 respond_to_request(mdr
, -EROFS
);
6386 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
6387 CDir
*destdir
= destdn
->get_dir();
6388 assert(destdir
->is_auth());
6390 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &srctrace
, NULL
, MDS_TRAVERSE_DISCOVER
);
6395 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
6396 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
6398 dout(10) << "FAIL on error " << r
<< dendl
;
6399 respond_to_request(mdr
, r
);
6404 assert(!srctrace
.empty());
6405 CDentry
*srcdn
= srctrace
[srctrace
.size()-1];
6406 dout(10) << " srcdn " << *srcdn
<< dendl
;
6407 if (srcdn
->last
!= CEPH_NOSNAP
) {
6408 respond_to_request(mdr
, -EROFS
);
6411 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
6412 CInode
*srci
= srcdnl
->get_inode();
6413 dout(10) << " srci " << *srci
<< dendl
;
6416 if (!destdnl
->is_null()) {
6417 //dout(10) << "dest dn exists " << *destdn << dendl;
6418 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
6420 dout(10) << " oldin " << *oldin
<< dendl
;
6422 // mv /some/thing /to/some/existing_other_thing
6423 if (oldin
->is_dir() && !srci
->is_dir()) {
6424 respond_to_request(mdr
, -EISDIR
);
6427 if (!oldin
->is_dir() && srci
->is_dir()) {
6428 respond_to_request(mdr
, -ENOTDIR
);
6432 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6433 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
6434 respond_to_request(mdr
, -ENOTEMPTY
);
6437 if (srci
== oldin
&& !srcdn
->get_dir()->inode
->is_stray()) {
6438 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
6443 // -- some sanity checks --
6445 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6446 if (destpath
.get_ino() != srcpath
.get_ino() &&
6447 !(req
->get_source().is_mds() &&
6448 MDS_INO_IS_MDSDIR(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6449 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
6450 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
6451 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6452 while (srcbase
!= destbase
&&
6453 !srcbase
->is_projected_ancestor_of(destbase
)) {
6454 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
6455 srctrace
.insert(srctrace
.begin(), pdn
);
6456 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
6457 srcbase
= pdn
->get_dir()->get_inode();
6460 // then, extend destpath until it shares the same parent inode as srcpath.
6461 while (destbase
!= srcbase
) {
6462 CDentry
*pdn
= destbase
->get_projected_parent_dn();
6463 desttrace
.insert(desttrace
.begin(), pdn
);
6464 rdlocks
.insert(&pdn
->lock
);
6465 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
6466 destbase
= pdn
->get_dir()->get_inode();
6468 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
6472 if (srcdn
->get_dir() == destdir
&& srcdn
->name
== destname
) {
6473 dout(7) << "rename src=dest, noop" << dendl
;
6474 respond_to_request(mdr
, 0);
6478 // dest a child of src?
6479 // e.g. mv /usr /usr/foo
6480 CDentry
*pdn
= destdir
->inode
->get_projected_parent_dn();
6483 dout(7) << "cannot rename item to be a child of itself" << dendl
;
6484 respond_to_request(mdr
, -EINVAL
);
6487 pdn
= pdn
->get_dir()->inode
->parent
;
6490 // is this a stray migration, reintegration or merge? (sanity checks!)
6491 if (mdr
->reqid
.name
.is_mds() &&
6492 !(MDS_INO_IS_MDSDIR(srcpath
.get_ino()) &&
6493 MDS_INO_IS_MDSDIR(destpath
.get_ino())) &&
6494 !(destdnl
->is_remote() &&
6495 destdnl
->get_remote_ino() == srci
->ino())) {
6496 respond_to_request(mdr
, -EINVAL
); // actually, this won't reply, but whatev.
6500 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
6501 (srcdnl
->is_primary() || destdnl
->is_primary()));
6503 dout(10) << " this is a link merge" << dendl
;
6505 // -- create stray dentry? --
6506 CDentry
*straydn
= NULL
;
6507 if (destdnl
->is_primary() && !linkmerge
) {
6508 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
6511 dout(10) << " straydn is " << *straydn
<< dendl
;
6512 } else if (mdr
->straydn
) {
6513 mdr
->unpin(mdr
->straydn
);
6514 mdr
->straydn
= NULL
;
6517 // -- prepare witness list --
6519 * NOTE: we use _all_ replicas as witnesses.
6520 * this probably isn't totally necessary (esp for file renames),
6521 * but if/when we change that, we have to make sure rejoin is
6522 * sufficiently robust to handle strong rejoins from survivors
6523 * with totally wrong dentry->inode linkage.
6524 * (currently, it can ignore rename effects, because the resolve
6525 * stage will sort them out.)
6527 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
6528 if (srcdn
->is_auth())
6529 srcdn
->list_replicas(witnesses
);
6531 witnesses
.insert(srcdn
->authority().first
);
6532 if (srcdnl
->is_remote() && !srci
->is_auth())
6533 witnesses
.insert(srci
->authority().first
);
6534 destdn
->list_replicas(witnesses
);
6535 if (destdnl
->is_remote() && !oldin
->is_auth())
6536 witnesses
.insert(oldin
->authority().first
);
6537 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
6541 map
<SimpleLock
*, mds_rank_t
> remote_wrlocks
;
6543 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6544 for (int i
=0; i
<(int)srctrace
.size(); i
++)
6545 rdlocks
.insert(&srctrace
[i
]->lock
);
6546 xlocks
.insert(&srcdn
->lock
);
6547 mds_rank_t srcdirauth
= srcdn
->get_dir()->authority().first
;
6548 if (srcdirauth
!= mds
->get_nodeid()) {
6549 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth
<< dendl
;
6550 remote_wrlocks
[&srcdn
->get_dir()->inode
->filelock
] = srcdirauth
;
6551 remote_wrlocks
[&srcdn
->get_dir()->inode
->nestlock
] = srcdirauth
;
6553 rdlocks
.insert(&srci
->dirfragtreelock
);
6555 wrlocks
.insert(&srcdn
->get_dir()->inode
->filelock
);
6556 wrlocks
.insert(&srcdn
->get_dir()->inode
->nestlock
);
6558 mds
->locker
->include_snap_rdlocks(rdlocks
, srcdn
->get_dir()->inode
);
6562 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
6563 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
6564 xlocks
.insert(&straydn
->lock
);
6567 // xlock versionlock on dentries if there are witnesses.
6568 // replicas can't see projected dentry linkages, and will get
6569 // confused if we try to pipeline things.
6570 if (!witnesses
.empty()) {
6571 // take xlock on all projected ancestor dentries for srcdn and destdn.
6572 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6573 for (int i
= 0; i
<(int)srctrace
.size(); i
++) {
6574 if (srctrace
[i
]->is_auth() && srctrace
[i
]->is_projected())
6575 xlocks
.insert(&srctrace
[i
]->versionlock
);
6577 for (int i
=0; i
<(int)desttrace
.size(); i
++) {
6578 if (desttrace
[i
]->is_auth() && desttrace
[i
]->is_projected())
6579 xlocks
.insert(&desttrace
[i
]->versionlock
);
6581 // xlock srci and oldin's primary dentries, so witnesses can call
6582 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6584 if (srcdnl
->is_remote())
6585 xlocks
.insert(&srci
->get_projected_parent_dn()->lock
);
6586 if (destdnl
->is_remote())
6587 xlocks
.insert(&oldin
->get_projected_parent_dn()->lock
);
6590 // we need to update srci's ctime. xlock its least contended lock to do that...
6591 xlocks
.insert(&srci
->linklock
);
6593 // xlock oldin (for nlink--)
6595 xlocks
.insert(&oldin
->linklock
);
6596 if (oldin
->is_dir())
6597 rdlocks
.insert(&oldin
->filelock
);
6599 if (srcdnl
->is_primary() && srci
->is_dir())
6600 // FIXME: this should happen whenever we are renamning between
6601 // realms, regardless of the file type
6602 // FIXME: If/when this changes, make sure to update the
6603 // "allowance" in handle_slave_rename_prep
6604 xlocks
.insert(&srci
->snaplock
); // FIXME: an auth bcast could be sufficient?
6606 rdlocks
.insert(&srci
->snaplock
);
6608 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: NULL
;
6609 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
,
6610 &remote_wrlocks
, auth_pin_freeze
))
6613 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6614 if (!check_access(mdr
, srcdn
->get_dir()->get_inode(), MAY_WRITE
))
6617 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
6620 if (!check_fragment_space(mdr
, destdn
->get_dir()))
6623 if (!check_access(mdr
, srci
, MAY_WRITE
))
6627 // with read lock, really verify oldin is empty
6630 _dir_is_nonempty(mdr
, oldin
)) {
6631 respond_to_request(mdr
, -ENOTEMPTY
);
6635 /* project_past_snaprealm_parent() will do this job
6637 // moving between snaprealms?
6638 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6639 SnapRealm *srcrealm = srci->find_snaprealm();
6640 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6641 if (srcrealm != destrealm &&
6642 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6643 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6644 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6645 mdcache->snaprealm_create(mdr, srci);
6651 assert(g_conf
->mds_kill_rename_at
!= 1);
6653 // -- open all srcdn inode frags, if any --
6654 // we need these open so that auth can properly delegate from inode to dirfrags
6655 // after the inode is _ours_.
6656 if (srcdnl
->is_primary() &&
6657 !srcdn
->is_auth() &&
6659 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
6660 mdr
->set_stickydirs(srci
);
6663 srci
->dirfragtree
.get_leaves(frags
);
6664 for (list
<frag_t
>::iterator p
= frags
.begin();
6667 CDir
*dir
= srci
->get_dirfrag(*p
);
6669 dout(10) << " opening " << *p
<< " under " << *srci
<< dendl
;
6670 mdcache
->open_remote_dirfrag(srci
, *p
, new C_MDS_RetryRequest(mdcache
, mdr
));
6676 // -- prepare witnesses --
6678 // do srcdn auth last
6679 mds_rank_t last
= MDS_RANK_NONE
;
6680 if (!srcdn
->is_auth()) {
6681 last
= srcdn
->authority().first
;
6682 mdr
->more()->srcdn_auth_mds
= last
;
6683 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6684 // are involved in the rename operation.
6685 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
6686 dout(10) << " preparing ambiguous auth for srci" << dendl
;
6687 assert(mdr
->more()->is_remote_frozen_authpin
);
6688 assert(mdr
->more()->rename_inode
== srci
);
6689 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
6694 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
6695 p
!= witnesses
.end();
6697 if (*p
== last
) continue; // do it last!
6698 if (mdr
->more()->witnessed
.count(*p
)) {
6699 dout(10) << " already witnessed by mds." << *p
<< dendl
;
6700 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
6701 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
6703 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
6707 if (!mdr
->more()->waiting_on_slave
.empty())
6708 return; // we're waiting for a witness.
6710 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
6711 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
6712 assert(mdr
->more()->waiting_on_slave
.count(last
) == 0);
6713 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
6717 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6718 if (!mdr
->more()->slaves
.empty() && !srci
->is_dir())
6719 assert(g_conf
->mds_kill_rename_at
!= 3);
6720 if (!mdr
->more()->slaves
.empty() && srci
->is_dir())
6721 assert(g_conf
->mds_kill_rename_at
!= 4);
6723 // -- declare now --
6724 mdr
->set_mds_stamp(ceph_clock_now());
6726 // -- prepare journal entry --
6727 mdr
->ls
= mdlog
->get_current_segment();
6728 EUpdate
*le
= new EUpdate(mdlog
, "rename");
6729 mdlog
->start_entry(le
);
6730 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6731 if (!mdr
->more()->witnessed
.empty()) {
6732 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6734 le
->reqid
= mdr
->reqid
;
6735 le
->had_slaves
= true;
6737 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6738 // no need to send frozen auth pin to recovring auth MDS of srci
6739 mdr
->more()->is_remote_frozen_authpin
= false;
6742 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, straydn
);
6743 if (le
->client_map
.length())
6744 le
->cmapv
= mds
->sessionmap
.get_projected();
6746 // -- commit locally --
6747 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
6749 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
6753 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
6755 dout(10) << "_rename_finish " << *mdr
<< dendl
;
6757 if (!mdr
->more()->witnessed
.empty())
6758 mdcache
->logged_master_update(mdr
->reqid
);
6761 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
6763 mdcache
->send_dentry_link(destdn
, mdr
);
6765 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
6766 CInode
*in
= destdnl
->get_inode();
6767 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
6769 // test hack: test slave commit
6770 if (!mdr
->more()->slaves
.empty() && !in
->is_dir())
6771 assert(g_conf
->mds_kill_rename_at
!= 5);
6772 if (!mdr
->more()->slaves
.empty() && in
->is_dir())
6773 assert(g_conf
->mds_kill_rename_at
!= 6);
6776 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), srcdn
->get_dir(), META_POP_IWR
);
6777 if (destdnl
->is_remote() && in
->is_auth())
6778 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), in
, META_POP_IWR
);
6780 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
6782 assert(g_conf
->mds_kill_rename_at
!= 7);
6785 respond_to_request(mdr
, 0);
6788 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
6791 // respond_to_request() drops locks. So stray reintegration can race with us.
6792 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
6793 mdcache
->notify_stray(straydn
);
6801 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
6802 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
6804 if (mds
->is_cluster_degraded() &&
6805 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
6806 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
6807 if (mdr
->more()->waiting_on_slave
.empty())
6808 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
6812 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
6813 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6814 MMDSSlaveRequest::OP_RENAMEPREP
);
6816 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
6817 for (auto dn
: srctrace
)
6818 req
->srcdnpath
.push_dentry(dn
->name
);
6819 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
6820 for (auto dn
: dsttrace
)
6821 req
->destdnpath
.push_dentry(dn
->name
);
6823 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
6825 // srcdn auth will verify our current witness list is sufficient
6826 req
->witnesses
= witnesse
;
6828 req
->op_stamp
= mdr
->get_op_stamp();
6829 mds
->send_message_mds(req
, who
);
6831 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
6832 mdr
->more()->waiting_on_slave
.insert(who
);
6836 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
6838 version_t oldpv
= mdr
->more()->inode_import_v
;
6840 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
6843 bufferlist::iterator blp
= mdr
->more()->inode_import
.begin();
6846 ::decode(mdr
->more()->imported_client_map
, blp
);
6847 ::encode(mdr
->more()->imported_client_map
, *client_map_bl
,
6848 mds
->mdsmap
->get_up_features());
6849 prepare_force_open_sessions(mdr
->more()->imported_client_map
, mdr
->more()->sseq_map
);
6851 list
<ScatterLock
*> updated_scatterlocks
;
6852 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
6853 mdr
->more()->cap_imports
, updated_scatterlocks
);
6855 // hack: force back to !auth and clean, temporarily
6856 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
6857 srcdnl
->get_inode()->mark_clean();
6862 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
6865 diri
->get_dirfrags(ls
);
6867 bool force_journal
= false;
6869 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6870 if ((*p
)->is_subtree_root() && (*p
)->get_dir_auth().first
== mds
->get_nodeid()) {
6871 dout(10) << " frag " << (*p
)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
6872 force_journal
= true;
6875 dout(20) << " frag " << (*p
)->get_frag() << " is not auth subtree dirfrag" << dendl
;
6878 // see if any children of our frags are auth subtrees.
6879 list
<CDir
*> subtrees
;
6880 mdcache
->list_subtrees(subtrees
);
6881 dout(10) << " subtrees " << subtrees
<< " frags " << ls
<< dendl
;
6882 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6884 for (list
<CDir
*>::iterator q
= subtrees
.begin(); q
!= subtrees
.end(); ++q
) {
6885 if (dir
->contains(*q
)) {
6886 if ((*q
)->get_dir_auth().first
== mds
->get_nodeid()) {
6887 dout(10) << " frag " << (*p
)->get_frag() << " contains (maybe) auth subtree, will force journal "
6889 force_journal
= true;
6892 dout(20) << " frag " << (*p
)->get_frag() << " contains but isn't auth for " << **q
<< dendl
;
6894 dout(20) << " frag " << (*p
)->get_frag() << " does not contain " << **q
<< dendl
;
6900 return force_journal
;
6903 void Server::_rename_prepare(MDRequestRef
& mdr
,
6904 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
6905 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
6907 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
6909 dout(10) << " straydn " << *straydn
<< dendl
;
6911 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
6912 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
6913 CInode
*srci
= srcdnl
->get_inode();
6914 CInode
*oldin
= destdnl
->get_inode();
6916 // primary+remote link merge?
6917 bool linkmerge
= (srci
== destdnl
->get_inode() &&
6918 (srcdnl
->is_primary() || destdnl
->is_primary()));
6919 bool silent
= srcdn
->get_dir()->inode
->is_stray();
6921 bool force_journal_dest
= false;
6922 if (srci
->is_dir() && !destdn
->is_auth()) {
6923 if (srci
->is_auth()) {
6924 // if we are auth for srci and exporting it, force journal because journal replay needs
6925 // the source inode to create auth subtrees.
6926 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
6927 force_journal_dest
= true;
6929 force_journal_dest
= _need_force_journal(srci
, false);
6932 bool force_journal_stray
= false;
6933 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
6934 force_journal_stray
= _need_force_journal(oldin
, true);
6937 dout(10) << " merging remote and primary links to the same inode" << dendl
;
6939 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
6940 if (force_journal_dest
)
6941 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
6942 if (force_journal_stray
)
6943 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
6945 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
6946 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
6947 metablob
->renamed_dirino
= srci
->ino();
6948 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
6949 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
6950 metablob
->renamed_dirino
= oldin
->ino();
6954 inode_t
*pi
= 0; // renamed inode
6955 inode_t
*tpi
= 0; // target/overwritten inode
6959 if (destdnl
->is_primary()) {
6960 assert(straydn
); // moving to straydn.
6961 // link--, and move.
6962 if (destdn
->is_auth()) {
6963 tpi
= oldin
->project_inode(); //project_snaprealm
6964 tpi
->version
= straydn
->pre_dirty(tpi
->version
);
6965 tpi
->update_backtrace();
6967 straydn
->push_projected_linkage(oldin
);
6968 } else if (destdnl
->is_remote()) {
6970 if (oldin
->is_auth()) {
6971 tpi
= oldin
->project_inode();
6972 tpi
->version
= oldin
->pre_dirty();
6978 if (srcdnl
->is_remote()) {
6981 if (destdn
->is_auth())
6982 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
6983 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
6985 if (srci
->is_auth()) {
6986 pi
= srci
->project_inode();
6987 pi
->version
= srci
->pre_dirty();
6990 dout(10) << " will merge remote onto primary link" << dendl
;
6991 if (destdn
->is_auth()) {
6992 pi
= oldin
->project_inode();
6993 pi
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->inode
.version
);
6997 if (destdn
->is_auth()) {
6999 if (srcdn
->is_auth())
7000 oldpv
= srci
->get_projected_version();
7002 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
7004 // note which dirfrags have child subtrees in the journal
7005 // event, so that we can open those (as bounds) during replay.
7006 if (srci
->is_dir()) {
7008 srci
->get_dirfrags(ls
);
7009 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7011 if (!dir
->is_auth())
7012 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
7014 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
7017 pi
= srci
->project_inode(); // project snaprealm if srcdnl->is_primary
7018 // & srcdnl->snaprealm
7019 pi
->version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
7020 pi
->update_backtrace();
7022 destdn
->push_projected_linkage(srci
);
7026 if (srcdn
->is_auth())
7027 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
7028 srcdn
->push_projected_linkage(); // push null linkage
7032 pi
->ctime
= mdr
->get_op_stamp();
7038 tpi
->ctime
= mdr
->get_op_stamp();
7040 destdn
->make_path_string(tpi
->stray_prior_path
);
7042 if (tpi
->nlink
== 0)
7043 oldin
->state_set(CInode::STATE_ORPHAN
);
7047 // prepare nesting, mtime updates
7048 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
7050 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7051 // then link the source inode to destdn
7052 if (destdnl
->is_primary()) {
7054 if (straydn
->is_auth()) {
7055 metablob
->add_dir_context(straydn
->get_dir());
7056 metablob
->add_dir(straydn
->get_dir(), true);
7061 if (destdn
->is_auth() && !destdnl
->is_null()) {
7062 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
7063 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
7064 if (destdnl
->is_primary())
7065 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
7066 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7070 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
7071 int flags
= predirty_dir
| predirty_primary
;
7072 if (srcdn
->is_auth())
7073 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
7074 if (destdn
->is_auth())
7075 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
7077 SnapRealm
*src_realm
= srci
->find_snaprealm();
7078 SnapRealm
*dest_realm
= destdn
->get_dir()->inode
->find_snaprealm();
7079 snapid_t next_dest_snap
= dest_realm
->get_newest_seq() + 1;
7081 // add it all to the metablob
7084 if (destdnl
->is_primary()) {
7085 if (destdn
->is_auth()) {
7086 // project snaprealm, too
7087 if (oldin
->snaprealm
|| dest_realm
->get_newest_seq() + 1 > oldin
->get_oldest_snap())
7088 oldin
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
7089 straydn
->first
= MAX(oldin
->first
, next_dest_snap
);
7090 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
7091 } else if (force_journal_stray
) {
7092 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
7093 metablob
->add_dir_context(straydn
->get_dir());
7094 metablob
->add_primary_dentry(straydn
, oldin
, true);
7096 } else if (destdnl
->is_remote()) {
7097 if (oldin
->is_auth()) {
7099 metablob
->add_dir_context(oldin
->get_projected_parent_dir());
7100 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin
->get_projected_parent_dn(),
7101 CEPH_NOSNAP
, 0, destdnl
);
7102 metablob
->add_primary_dentry(oldin
->get_projected_parent_dn(), oldin
, true);
7108 if (srcdnl
->is_remote()) {
7110 if (destdn
->is_auth() && !destdnl
->is_null())
7111 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7113 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7115 if (destdn
->is_auth())
7116 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
7117 if (srci
->get_projected_parent_dn()->is_auth()) { // it's remote
7118 metablob
->add_dir_context(srci
->get_projected_parent_dir());
7119 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci
->get_projected_parent_dn(), CEPH_NOSNAP
, 0, srcdnl
);
7120 metablob
->add_primary_dentry(srci
->get_projected_parent_dn(), srci
, true);
7123 if (destdn
->is_auth() && !destdnl
->is_null())
7124 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7126 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7128 if (destdn
->is_auth())
7129 metablob
->add_primary_dentry(destdn
, destdnl
->get_inode(), true, true);
7131 } else if (srcdnl
->is_primary()) {
7132 // project snap parent update?
7133 if (destdn
->is_auth() && src_realm
!= dest_realm
&&
7134 (srci
->snaprealm
|| src_realm
->get_newest_seq() + 1 > srci
->get_oldest_snap()))
7135 srci
->project_past_snaprealm_parent(dest_realm
);
7137 if (destdn
->is_auth() && !destdnl
->is_null())
7138 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7140 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7142 if (destdn
->is_auth())
7143 metablob
->add_primary_dentry(destdn
, srci
, true, true);
7144 else if (force_journal_dest
) {
7145 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
7146 metablob
->add_dir_context(destdn
->get_dir());
7147 metablob
->add_primary_dentry(destdn
, srci
, true);
7148 if (srcdn
->is_auth() && srci
->is_dir()) {
7149 // journal new subtrees root dirfrags
7151 srci
->get_dirfrags(ls
);
7152 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7155 metablob
->add_dir(dir
, true);
7162 if (srcdn
->is_auth()) {
7163 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
7164 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
7165 // also journal the inode in case we need do slave rename rollback. It is Ok to add
7166 // both primary and NULL dentries. Because during journal replay, null dentry is
7167 // processed after primary dentry.
7168 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
7169 metablob
->add_primary_dentry(srcdn
, srci
, true);
7170 metablob
->add_null_dentry(srcdn
, true);
7172 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
7174 // make renamed inode first track the dn
7175 if (srcdnl
->is_primary() && destdn
->is_auth())
7176 srci
->first
= destdn
->first
;
7178 if (oldin
&& oldin
->is_dir())
7179 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
7181 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
7186 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7188 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
7189 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
7191 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7192 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7194 CInode
*oldin
= destdnl
->get_inode();
7196 bool imported_inode
= false;
7198 // primary+remote link merge?
7199 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
7200 (srcdnl
->is_primary() || destdnl
->is_primary()));
7204 if (destdnl
->is_primary()) {
7206 dout(10) << "straydn is " << *straydn
<< dendl
;
7207 destdn
->get_dir()->unlink_inode(destdn
);
7209 straydn
->pop_projected_linkage();
7210 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7211 assert(!straydn
->is_projected()); // no other projected
7213 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
7216 if (destdn
->is_auth()) {
7217 bool hadrealm
= (oldin
->snaprealm
? true : false);
7218 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7219 if (oldin
->snaprealm
&& !hadrealm
)
7220 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
);
7222 // FIXME this snaprealm is not filled out correctly
7223 //oldin->open_snaprealm(); might be sufficient..
7225 } else if (destdnl
->is_remote()) {
7226 destdn
->get_dir()->unlink_inode(destdn
);
7227 if (oldin
->is_auth())
7228 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7232 // unlink src before we relink it at dest
7233 CInode
*in
= srcdnl
->get_inode();
7236 bool srcdn_was_remote
= srcdnl
->is_remote();
7237 srcdn
->get_dir()->unlink_inode(srcdn
);
7240 if (srcdn_was_remote
) {
7243 destdnl
= destdn
->pop_projected_linkage();
7244 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7245 assert(!destdn
->is_projected()); // no other projected
7247 destdn
->link_remote(destdnl
, in
);
7248 if (destdn
->is_auth())
7249 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
7252 in
->pop_and_dirty_projected_inode(mdr
->ls
);
7254 dout(10) << "merging remote onto primary link" << dendl
;
7255 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7259 dout(10) << "merging primary onto remote link" << dendl
;
7260 destdn
->get_dir()->unlink_inode(destdn
);
7262 destdnl
= destdn
->pop_projected_linkage();
7263 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7264 assert(!destdn
->is_projected()); // no other projected
7266 // srcdn inode import?
7267 if (!srcdn
->is_auth() && destdn
->is_auth()) {
7268 assert(mdr
->more()->inode_import
.length() > 0);
7270 map
<client_t
,Capability::Import
> imported_caps
;
7272 // finish cap imports
7273 finish_force_open_sessions(mdr
->more()->imported_client_map
, mdr
->more()->sseq_map
);
7274 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
7275 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
7276 mdr
->more()->srcdn_auth_mds
, true,
7277 mdr
->more()->cap_imports
[destdnl
->get_inode()],
7281 mdr
->more()->inode_import
.clear();
7282 ::encode(imported_caps
, mdr
->more()->inode_import
);
7284 /* hack: add an auth pin for each xlock we hold. These were
7285 * remote xlocks previously but now they're local and
7286 * we're going to try and unpin when we xlock_finish. */
7287 for (set
<SimpleLock
*>::iterator i
= mdr
->xlocks
.begin();
7288 i
!= mdr
->xlocks
.end();
7290 if ((*i
)->get_parent() == destdnl
->get_inode() &&
7291 !(*i
)->is_locallock())
7292 mds
->locker
->xlock_import(*i
);
7294 // hack: fix auth bit
7295 in
->state_set(CInode::STATE_AUTH
);
7296 imported_inode
= true;
7298 mdr
->clear_ambiguous_auth();
7301 if (destdn
->is_auth()) {
7302 in
->pop_and_dirty_projected_inode(mdr
->ls
);
7305 // FIXME: fix up snaprealm!
7310 if (srcdn
->is_auth())
7311 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
7312 srcdn
->pop_projected_linkage();
7313 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7314 assert(!srcdn
->is_projected()); // no other projected
7316 // apply remaining projected inodes (nested)
7319 // update subtree map?
7320 if (destdnl
->is_primary() && in
->is_dir())
7321 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true, imported_inode
);
7323 if (straydn
&& oldin
->is_dir())
7324 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
7326 // removing a new dn?
7327 if (srcdn
->is_auth())
7328 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
7336 class C_MDS_SlaveRenamePrep
: public ServerLogContext
{
7337 CDentry
*srcdn
, *destdn
, *straydn
;
7339 C_MDS_SlaveRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
7340 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
7341 void finish(int r
) override
{
7342 server
->_logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
7346 class C_MDS_SlaveRenameCommit
: public ServerContext
{
7348 CDentry
*srcdn
, *destdn
, *straydn
;
7350 C_MDS_SlaveRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
7351 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
7352 void finish(int r
) override
{
7353 server
->_commit_slave_rename(mdr
, r
, srcdn
, destdn
, straydn
);
7357 class C_MDS_SlaveRenameSessionsFlushed
: public ServerContext
{
7360 C_MDS_SlaveRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
7361 ServerContext(s
), mdr(r
) {}
7362 void finish(int r
) override
{
7363 server
->_slave_rename_sessions_flushed(mdr
);
7367 /* This function DOES put the mdr->slave_request before returning*/
7368 void Server::handle_slave_rename_prep(MDRequestRef
& mdr
)
7370 dout(10) << "handle_slave_rename_prep " << *mdr
7371 << " " << mdr
->slave_request
->srcdnpath
7372 << " to " << mdr
->slave_request
->destdnpath
7376 filepath
destpath(mdr
->slave_request
->destdnpath
);
7377 dout(10) << " dest " << destpath
<< dendl
;
7378 vector
<CDentry
*> trace
;
7379 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, destpath
, &trace
, NULL
, MDS_TRAVERSE_DISCOVERXLOCK
);
7382 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
7386 assert(r
== 0); // we shouldn't get an error here!
7388 CDentry
*destdn
= trace
[trace
.size()-1];
7389 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7390 dout(10) << " destdn " << *destdn
<< dendl
;
7394 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
7395 dout(10) << " src " << srcpath
<< dendl
;
7396 CInode
*srci
= nullptr;
7397 r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &trace
, &srci
, MDS_TRAVERSE_DISCOVERXLOCK
);
7401 // srcpath must not point to a null dentry
7402 assert(srci
!= nullptr);
7404 CDentry
*srcdn
= trace
[trace
.size()-1];
7405 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7406 dout(10) << " srcdn " << *srcdn
<< dendl
;
7411 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
7412 (srcdnl
->is_primary() || destdnl
->is_primary()));
7413 CDentry
*straydn
= mdr
->straydn
;
7414 if (destdnl
->is_primary() && !linkmerge
)
7417 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
7418 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
7420 // set up commit waiter (early, to clean up any freezing etc we do)
7421 if (!mdr
->more()->slave_commit
)
7422 mdr
->more()->slave_commit
= new C_MDS_SlaveRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
7425 if (srcdn
->is_auth()) {
7426 set
<mds_rank_t
> srcdnrep
;
7427 srcdn
->list_replicas(srcdnrep
);
7429 bool reply_witness
= false;
7430 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
7433 // - avoid conflicting lock state changes
7434 // - avoid concurrent updates to the inode
7435 // (this could also be accomplished with the versionlock)
7436 int allowance
= 2; // 1 for the mdr auth_pin, 1 for the link lock
7437 allowance
+= srcdnl
->get_inode()->is_dir(); // for the snap lock
7438 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
7439 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
7441 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7442 if (srcdnl
->get_inode()->is_frozen_auth_pin())
7443 mdr
->unfreeze_auth_pin();
7445 if (!frozen_inode
) {
7446 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
7451 * set ambiguous auth for srci
7452 * NOTE: we don't worry about ambiguous cache expire as we do
7453 * with subtree migrations because all slaves will pin
7454 * srcdn->get_inode() for duration of this rename.
7456 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
7458 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7459 // the master will send another OP_RENAMEPREP slave request later.
7460 if (mdr
->slave_request
->witnesses
.size() > 1) {
7461 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
7462 reply_witness
= true;
7465 // make sure bystanders have received all lock related messages
7466 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
7467 if (*p
== mdr
->slave_to_mds
||
7468 (mds
->is_cluster_degraded() &&
7469 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
7471 MMDSSlaveRequest
*notify
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
7472 MMDSSlaveRequest::OP_RENAMENOTIFY
);
7473 mds
->send_message_mds(notify
, *p
);
7474 mdr
->more()->waiting_on_slave
.insert(*p
);
7477 // make sure clients have received all cap related messages
7478 set
<client_t
> export_client_set
;
7479 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
7481 MDSGatherBuilder
gather(g_ceph_context
);
7482 flush_client_sessions(export_client_set
, gather
);
7483 if (gather
.has_subs()) {
7484 mdr
->more()->waiting_on_slave
.insert(MDS_RANK_NONE
);
7485 gather
.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr
));
7490 // is witness list sufficient?
7491 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
7492 if (*p
== mdr
->slave_to_mds
||
7493 mdr
->slave_request
->witnesses
.count(*p
)) continue;
7494 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
7495 reply_witness
= true;
7499 if (reply_witness
) {
7500 assert(!srcdnrep
.empty());
7501 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
7502 MMDSSlaveRequest::OP_RENAMEPREPACK
);
7503 reply
->witnesses
.swap(srcdnrep
);
7504 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7505 mdr
->slave_request
->put();
7506 mdr
->slave_request
= 0;
7509 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
7510 if (!mdr
->more()->waiting_on_slave
.empty()) {
7511 dout(10) << " still waiting for rename notify acks from "
7512 << mdr
->more()->waiting_on_slave
<< dendl
;
7515 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
7516 // set ambiguous auth for srci on witnesses
7517 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
7520 // encode everything we'd need to roll this back... basically, just the original state.
7521 rename_rollback rollback
;
7523 rollback
.reqid
= mdr
->reqid
;
7525 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
7526 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
7527 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
7528 rollback
.orig_src
.dname
= srcdn
->name
;
7529 if (srcdnl
->is_primary())
7530 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
7532 assert(srcdnl
->is_remote());
7533 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
7534 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
7537 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
7538 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
7539 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
7540 rollback
.orig_dest
.dname
= destdn
->name
;
7541 if (destdnl
->is_primary())
7542 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
7543 else if (destdnl
->is_remote()) {
7544 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
7545 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
7549 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
7550 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
7551 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
7552 rollback
.stray
.dname
= straydn
->name
;
7554 ::encode(rollback
, mdr
->more()->rollback_bl
);
7555 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7558 mdr
->ls
= mdlog
->get_current_segment();
7559 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_prep", mdr
->reqid
, mdr
->slave_to_mds
,
7560 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RENAME
);
7561 mdlog
->start_entry(le
);
7562 le
->rollback
= mdr
->more()->rollback_bl
;
7564 bufferlist blah
; // inode import data... obviously not used if we're the slave
7565 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, straydn
);
7567 if (le
->commit
.empty()) {
7568 dout(10) << " empty metablob, skipping journal" << dendl
;
7569 mdlog
->cancel_entry(le
);
7571 _logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
7573 mdr
->more()->slave_update_journaled
= true;
7574 submit_mdlog_entry(le
, new C_MDS_SlaveRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
7580 void Server::_logged_slave_rename(MDRequestRef
& mdr
,
7581 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7583 dout(10) << "_logged_slave_rename " << *mdr
<< dendl
;
7586 MMDSSlaveRequest
*reply
= NULL
;
7587 if (!mdr
->aborted
) {
7588 reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
7589 if (!mdr
->more()->slave_update_journaled
)
7590 reply
->mark_not_journaled();
7593 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7594 CDentry::linkage_t
*destdnl
= NULL
;
7595 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7598 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
7599 // set export bounds for CInode::encode_export()
7601 if (srcdnl
->get_inode()->is_dir()) {
7602 srcdnl
->get_inode()->get_dirfrags(bounds
);
7603 for (list
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
)
7604 (*p
)->state_set(CDir::STATE_EXPORTBOUND
);
7607 map
<client_t
,entity_inst_t
> exported_client_map
;
7609 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
7610 exported_client_map
);
7612 for (list
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
)
7613 (*p
)->state_clear(CDir::STATE_EXPORTBOUND
);
7616 ::encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
7617 reply
->inode_export
.claim_append(inodebl
);
7618 reply
->inode_export_v
= srcdnl
->get_inode()->inode
.version
;
7621 // remove mdr auth pin
7622 mdr
->auth_unpin(srcdnl
->get_inode());
7623 mdr
->more()->is_inode_exporter
= true;
7625 if (srcdnl
->get_inode()->is_dirty())
7626 srcdnl
->get_inode()->mark_clean();
7628 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
7632 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
7634 destdnl
= destdn
->get_linkage();
7637 mds
->balancer
->hit_dir(mdr
->get_mds_stamp(), srcdn
->get_dir(), META_POP_IWR
);
7638 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
7639 mds
->balancer
->hit_inode(mdr
->get_mds_stamp(), destdnl
->get_inode(),
7643 mdr
->slave_request
->put();
7644 mdr
->slave_request
= 0;
7648 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7650 assert(mdr
->aborted
);
7651 dout(10) << " abort flag set, finishing" << dendl
;
7652 mdcache
->request_finish(mdr
);
7656 void Server::_commit_slave_rename(MDRequestRef
& mdr
, int r
,
7657 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7659 dout(10) << "_commit_slave_rename " << *mdr
<< " r=" << r
<< dendl
;
7661 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7663 list
<MDSInternalContextBase
*> finished
;
7665 // unfreeze+singleauth inode
7666 // hmm, do i really need to delay this?
7667 if (mdr
->more()->is_inode_exporter
) {
7669 CInode
*in
= destdnl
->get_inode();
7672 // we exported, clear out any xlocks that we moved to another MDS
7673 set
<SimpleLock
*>::iterator i
= mdr
->xlocks
.begin();
7674 while (i
!= mdr
->xlocks
.end()) {
7675 SimpleLock
*lock
= *i
++;
7677 // we only care about xlocks on the exported inode
7678 if (lock
->get_parent() == in
&&
7679 !lock
->is_locallock())
7680 mds
->locker
->xlock_export(lock
, mdr
.get());
7683 map
<client_t
,Capability::Import
> peer_imported
;
7684 bufferlist::iterator bp
= mdr
->more()->inode_import
.begin();
7685 ::decode(peer_imported
, bp
);
7687 dout(10) << " finishing inode export on " << *destdnl
->get_inode() << dendl
;
7688 mdcache
->migrator
->finish_export_inode(destdnl
->get_inode(),
7689 mdr
->get_mds_stamp(),
7690 mdr
->slave_to_mds
, peer_imported
, finished
);
7691 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
7694 assert(destdnl
->get_inode()->is_frozen_inode());
7695 destdnl
->get_inode()->unfreeze_inode(finished
);
7699 if (mdr
->more()->is_ambiguous_auth
) {
7700 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
7701 mdr
->more()->is_ambiguous_auth
= false;
7705 mds
->queue_waiters(finished
);
7708 if (mdr
->more()->slave_update_journaled
) {
7709 // write a commit to the journal
7710 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_commit", mdr
->reqid
,
7711 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
7712 ESlaveUpdate::RENAME
);
7713 mdlog
->start_entry(le
);
7714 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
7717 _committed_slave(mdr
);
7722 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7723 // witness list from the master, and they failed before we tried prep again.
7724 if (mdr
->more()->rollback_bl
.length()) {
7725 if (mdr
->more()->is_inode_exporter
) {
7726 dout(10) << " reversing inode export of " << *destdnl
->get_inode() << dendl
;
7727 destdnl
->get_inode()->abort_export();
7729 if (mdcache
->is_ambiguous_slave_update(mdr
->reqid
, mdr
->slave_to_mds
)) {
7730 mdcache
->remove_ambiguous_slave_update(mdr
->reqid
, mdr
->slave_to_mds
);
7731 // rollback but preserve the slave request
7732 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
, false);
7733 mdr
->more()->rollback_bl
.clear();
7735 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
, true);
7737 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl
;
7739 if (mdr
->more()->is_ambiguous_auth
) {
7740 if (srcdn
->is_auth())
7741 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
7743 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
7744 mdr
->more()->is_ambiguous_auth
= false;
7746 mds
->queue_waiters(finished
);
7747 mdcache
->request_finish(mdr
);
7752 void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
, rename_rollback::drec
&r
, utime_t ctime
,
7753 bool isdir
, int linkunlink
, nest_info_t
&rstat
)
7756 pf
= dir
->project_fnode();
7757 mut
->add_projected_fnode(dir
);
7758 pf
->version
= dir
->pre_dirty();
7761 pf
->fragstat
.nsubdirs
+= linkunlink
;
7763 pf
->fragstat
.nfiles
+= linkunlink
;
7766 pf
->rstat
.rbytes
+= linkunlink
* rstat
.rbytes
;
7767 pf
->rstat
.rfiles
+= linkunlink
* rstat
.rfiles
;
7768 pf
->rstat
.rsubdirs
+= linkunlink
* rstat
.rsubdirs
;
7769 pf
->rstat
.rsnaprealms
+= linkunlink
* rstat
.rsnaprealms
;
7771 if (pf
->fragstat
.mtime
== ctime
) {
7772 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
7773 if (pf
->rstat
.rctime
== ctime
)
7774 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
7776 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
7777 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
7780 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
7787 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
7788 CDentry
*sd
, version_t pv
, CDentry
*dd
,
7789 CDentry
*st
, bool f
) :
7790 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
7791 straydn(st
), finish_mdr(f
) {}
7792 void finish(int r
) override
{
7793 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
7794 destdn
, straydn
, finish_mdr
);
7798 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
,
7801 rename_rollback rollback
;
7802 bufferlist::iterator p
= rbl
.begin();
7803 ::decode(rollback
, p
);
7805 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
7806 // need to finish this update before sending resolve to claim the subtree
7807 mdcache
->add_rollback(rollback
.reqid
, master
);
7809 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
7810 mut
->ls
= mds
->mdlog
->get_current_segment();
7812 CDentry
*srcdn
= NULL
;
7813 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
7815 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
7817 dout(10) << " srcdir " << *srcdir
<< dendl
;
7818 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
7820 dout(10) << " srcdn " << *srcdn
<< dendl
;
7821 assert(srcdn
->get_linkage()->is_null());
7823 dout(10) << " srcdn not found" << dendl
;
7825 dout(10) << " srcdir not found" << dendl
;
7827 CDentry
*destdn
= NULL
;
7828 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
7830 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
7832 dout(10) << " destdir " << *destdir
<< dendl
;
7833 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
7835 dout(10) << " destdn " << *destdn
<< dendl
;
7837 dout(10) << " destdn not found" << dendl
;
7839 dout(10) << " destdir not found" << dendl
;
7842 if (rollback
.orig_src
.ino
) {
7843 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
7844 if (in
&& in
->is_dir())
7845 assert(srcdn
&& destdn
);
7847 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
7849 CDir
*straydir
= NULL
;
7850 CDentry
*straydn
= NULL
;
7851 if (rollback
.stray
.dirfrag
.ino
) {
7852 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
7854 dout(10) << "straydir " << *straydir
<< dendl
;
7855 straydn
= straydir
->lookup(rollback
.stray
.dname
);
7857 dout(10) << " straydn " << *straydn
<< dendl
;
7858 assert(straydn
->get_linkage()->is_primary());
7860 dout(10) << " straydn not found" << dendl
;
7862 dout(10) << "straydir not found" << dendl
;
7865 CInode
*target
= NULL
;
7866 if (rollback
.orig_dest
.ino
) {
7867 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
7869 assert(destdn
&& straydn
);
7870 } else if (rollback
.orig_dest
.remote_ino
)
7871 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
7873 // can't use is_auth() in the resolve stage
7874 mds_rank_t whoami
= mds
->get_nodeid();
7876 assert(!destdn
|| destdn
->authority().first
!= whoami
);
7877 assert(!straydn
|| straydn
->authority().first
!= whoami
);
7879 bool force_journal_src
= false;
7880 bool force_journal_dest
= false;
7881 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
7882 force_journal_src
= _need_force_journal(in
, false);
7883 if (in
&& target
&& target
->is_dir())
7884 force_journal_dest
= _need_force_journal(in
, true);
7886 version_t srcdnpv
= 0;
7889 if (srcdn
->authority().first
== whoami
)
7890 srcdnpv
= srcdn
->pre_dirty();
7891 if (rollback
.orig_src
.ino
) {
7893 srcdn
->push_projected_linkage(in
);
7895 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
7896 rollback
.orig_src
.remote_d_type
);
7901 if (in
->authority().first
== whoami
) {
7902 pi
= in
->project_inode();
7903 mut
->add_projected_inode(in
);
7904 pi
->version
= in
->pre_dirty();
7906 pi
= in
->get_projected_inode();
7907 if (pi
->ctime
== rollback
.ctime
)
7908 pi
->ctime
= rollback
.orig_src
.old_ctime
;
7911 if (srcdn
&& srcdn
->authority().first
== whoami
) {
7913 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
7914 in
? in
->is_dir() : false, 1, pi
? pi
->accounted_rstat
: blah
);
7919 if (rollback
.orig_dest
.ino
&& target
) {
7920 destdn
->push_projected_linkage(target
);
7921 } else if (rollback
.orig_dest
.remote_ino
) {
7922 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
7923 rollback
.orig_dest
.remote_d_type
);
7925 // the dentry will be trimmed soon, it's ok to have wrong linkage
7926 if (rollback
.orig_dest
.ino
)
7927 assert(mds
->is_resolve());
7928 destdn
->push_projected_linkage();
7933 straydn
->push_projected_linkage();
7937 if (target
->authority().first
== whoami
) {
7938 ti
= target
->project_inode();
7939 mut
->add_projected_inode(target
);
7940 ti
->version
= target
->pre_dirty();
7942 ti
= target
->get_projected_inode();
7943 if (ti
->ctime
== rollback
.ctime
)
7944 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
7945 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
7946 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
7947 assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
7949 assert(rollback
.orig_dest
.remote_ino
&&
7950 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
7956 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
7958 dout(0) << " srci back to " << *in
<< dendl
;
7960 dout(0) << " destdn back to " << *destdn
<< dendl
;
7962 dout(0) << " desti back to " << *target
<< dendl
;
7965 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_rollback", rollback
.reqid
, master
,
7966 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RENAME
);
7967 mdlog
->start_entry(le
);
7969 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
7970 le
->commit
.add_dir_context(srcdir
);
7971 if (rollback
.orig_src
.ino
)
7972 le
->commit
.add_primary_dentry(srcdn
, 0, true);
7974 le
->commit
.add_remote_dentry(srcdn
, true);
7977 if (!rollback
.orig_src
.ino
&& // remote linkage
7978 in
&& in
->authority().first
== whoami
) {
7979 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
7980 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
7983 if (force_journal_dest
) {
7984 assert(rollback
.orig_dest
.ino
);
7985 le
->commit
.add_dir_context(destdir
);
7986 le
->commit
.add_primary_dentry(destdn
, 0, true);
7989 // slave: no need to journal straydn
7991 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
7992 assert(rollback
.orig_dest
.remote_ino
);
7993 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
7994 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
7997 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
7998 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
7999 le
->commit
.renamed_dirino
= in
->ino();
8000 if (srcdn
->authority().first
== whoami
) {
8002 in
->get_dirfrags(ls
);
8003 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
8005 if (!dir
->is_auth())
8006 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
8008 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
8010 } else if (force_journal_dest
) {
8011 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
8012 le
->commit
.renamed_dirino
= target
->ino();
8015 if (target
&& target
->is_dir()) {
8017 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
8020 if (in
&& in
->is_dir()) {
8022 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
8025 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
8026 assert(le
->commit
.empty());
8027 mdlog
->cancel_entry(le
);
8029 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, finish_mdr
);
8031 assert(!le
->commit
.empty());
8033 mdr
->more()->slave_update_journaled
= false;
8034 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
, srcdn
, srcdnpv
,
8035 destdn
, straydn
, finish_mdr
);
8036 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
8041 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
8042 version_t srcdnpv
, CDentry
*destdn
,
8043 CDentry
*straydn
, bool finish_mdr
)
8045 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
8048 straydn
->get_dir()->unlink_inode(straydn
);
8049 straydn
->pop_projected_linkage();
8052 destdn
->get_dir()->unlink_inode(destdn
);
8053 destdn
->pop_projected_linkage();
8056 srcdn
->pop_projected_linkage();
8057 if (srcdn
->authority().first
== mds
->get_nodeid())
8058 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
8063 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
8064 CInode
*in
= srcdn
->get_linkage()->get_inode();
8065 if (srcdn
->authority().first
== mds
->get_nodeid())
8066 in
->state_set(CInode::STATE_AUTH
);
8067 // update subtree map?
8068 if (in
&& in
->is_dir()) {
8070 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
8075 CInode
*oldin
= destdn
->get_linkage()->get_inode();
8076 // update subtree map?
8077 if (oldin
&& oldin
->is_dir()) {
8079 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
8083 if (mds
->is_resolve()) {
8086 root
= mdcache
->get_subtree_root(straydn
->get_dir());
8088 root
= mdcache
->get_subtree_root(destdn
->get_dir());
8090 mdcache
->try_trim_non_auth_subtree(root
);
8094 list
<MDSInternalContextBase
*> finished
;
8095 if (mdr
->more()->is_ambiguous_auth
) {
8096 if (srcdn
->is_auth())
8097 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
8099 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
8100 mdr
->more()->is_ambiguous_auth
= false;
8102 mds
->queue_waiters(finished
);
8103 if (finish_mdr
|| mdr
->aborted
)
8104 mdcache
->request_finish(mdr
);
8106 mdr
->more()->slave_rolling_back
= false;
8109 mdcache
->finish_rollback(mut
->reqid
);
8114 /* This function DOES put the passed message before returning*/
8115 void Server::handle_slave_rename_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8117 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8118 << " witnessed by " << ack
->get_source()
8119 << " " << *ack
<< dendl
;
8120 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8123 mdr
->more()->slaves
.insert(from
);
8124 if (mdr
->more()->srcdn_auth_mds
== from
&&
8125 mdr
->more()->is_remote_frozen_authpin
&&
8126 !mdr
->more()->is_ambiguous_auth
) {
8127 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
8130 // witnessed? or add extra witnesses?
8131 assert(mdr
->more()->witnessed
.count(from
) == 0);
8132 if (ack
->witnesses
.empty()) {
8133 mdr
->more()->witnessed
.insert(from
);
8134 if (!ack
->is_not_journaled())
8135 mdr
->more()->has_journaled_slaves
= true;
8137 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
8138 mdr
->more()->extra_witnesses
.swap(ack
->witnesses
);
8139 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
8143 if (ack
->inode_export
.length()) {
8144 dout(10) << " got srci import" << dendl
;
8145 mdr
->more()->inode_import
.claim(ack
->inode_export
);
8146 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
8149 // remove from waiting list
8150 assert(mdr
->more()->waiting_on_slave
.count(from
));
8151 mdr
->more()->waiting_on_slave
.erase(from
);
8153 if (mdr
->more()->waiting_on_slave
.empty())
8154 dispatch_client_request(mdr
); // go again!
8156 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
8159 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8161 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
8162 << ack
->get_source() << dendl
;
8163 assert(mdr
->is_slave());
8164 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8166 if (mdr
->more()->waiting_on_slave
.count(from
)) {
8167 mdr
->more()->waiting_on_slave
.erase(from
);
8169 if (mdr
->more()->waiting_on_slave
.empty()) {
8170 if (mdr
->slave_request
)
8171 dispatch_slave_request(mdr
);
8173 dout(10) << " still waiting for rename notify acks from "
8174 << mdr
->more()->waiting_on_slave
<< dendl
;
8178 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
8180 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
8182 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
8183 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
8185 if (mdr
->more()->waiting_on_slave
.empty()) {
8186 if (mdr
->slave_request
)
8187 dispatch_slave_request(mdr
);
8189 dout(10) << " still waiting for rename notify acks from "
8190 << mdr
->more()->waiting_on_slave
<< dendl
;
8195 /* This function takes responsibility for the passed mdr*/
8196 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
8198 MClientRequest
*req
= mdr
->client_request
;
8201 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8202 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8203 respond_to_request(mdr
, -ESTALE
);
8206 if (!diri
->is_auth()) {
8207 mdcache
->request_forward(mdr
, diri
->authority().first
);
8210 if (!diri
->is_dir()) {
8211 respond_to_request(mdr
, -ENOTDIR
);
8214 dout(10) << "lssnap on " << *diri
<< dendl
;
8217 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8218 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8219 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8222 if (!check_access(mdr
, diri
, MAY_READ
))
8225 SnapRealm
*realm
= diri
->find_snaprealm();
8226 map
<snapid_t
,SnapInfo
*> infomap
;
8227 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
8229 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
8231 max_entries
= infomap
.size();
8232 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
8234 // make sure at least one item can be encoded
8235 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
8237 __u64 last_snapid
= 0;
8238 string offset_str
= req
->get_path2();
8239 if (!offset_str
.empty())
8240 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
8243 encode_empty_dirstat(dirbl
);
8245 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
8249 map
<snapid_t
,SnapInfo
*>::iterator p
= infomap
.upper_bound(last_snapid
);
8250 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
8251 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
8255 if (p
->second
->ino
== diri
->ino())
8256 snap_name
= p
->second
->name
;
8258 snap_name
= p
->second
->get_long_name();
8260 unsigned start_len
= dnbl
.length();
8261 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
8264 ::encode(snap_name
, dnbl
);
8265 encode_infinite_lease(dnbl
);
8267 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
8270 keep
.substr_of(dnbl
, 0, start_len
);
8277 ::encode(num
, dirbl
);
8279 if (p
== infomap
.end()) {
8280 flags
= CEPH_READDIR_FRAG_END
;
8281 if (last_snapid
== 0)
8282 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
8284 ::encode(flags
, dirbl
);
8285 dirbl
.claim_append(dnbl
);
8287 mdr
->reply_extra_bl
= dirbl
;
8289 respond_to_request(mdr
, 0);
8295 struct C_MDS_mksnap_finish
: public ServerLogContext
{
8298 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
8299 ServerLogContext(s
, r
), diri(di
), info(i
) {}
8300 void finish(int r
) override
{
8301 server
->_mksnap_finish(mdr
, diri
, info
);
/* This function takes responsibility for the passed mdr*/
// Handle CEPH_MDS_OP_MKSNAP: create a snapshot (named by the request's
// last path component) on the directory inode named by the filepath's
// inode number.  Flow: policy/sanity checks -> take locks -> reserve a
// snapid from the snap table -> project inode + snaprealm -> journal an
// EUpdate, completing in Server::_mksnap_finish.
// NOTE(review): this excerpt elides some original lines (the early
// `return;` after each respond_to_request/forward, closing braces, and
// the declarations of `snapid` and `info`); comments below describe only
// the visible statements.
void Server::handle_client_mksnap(MDRequestRef& mdr)
  // Snapshots must be explicitly enabled in the MDSMap.
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    respond_to_request(mdr, -EPERM);

  MClientRequest *req = mdr->client_request;
  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  // Unknown or being-purged inode: the client's handle is stale.
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);

  // The snapshot must be taken on the auth MDS for this inode.
  if (!diri->is_auth()) { // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);

  // Only directories can be snapshotted.
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);

  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    respond_to_request(mdr, -EPERM);

  const string &snapname = req->get_filepath().last_dentry();

  // Restrict snapshot creation to the configured uid range.
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);

  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;

  // Take the usual snap rdlocks, but xlock this inode's own snaplock.
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  // On failure these presumably requeue the request themselves; the
  // visible code just falls out without replying (returns elided).
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))

  if (!check_access(mdr, diri, MAY_WRITE))

  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -EEXIST);

  // Empty names and the reserved '_' prefix are invalid.
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);

  // allocate a snapid
  if (!mdr->more()->stid) {
    // Async round-trip to the snap table; the request is retried once
    // stid/snapidbl have been filled in.
    mds->snapclient->prepare_create(diri->ino(), snapname,
                                    mdr->get_mds_stamp(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));

  version_t stid = mdr->more()->stid;
  // Decode the allocated snapid (its declaration is elided above).
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  ::decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  // Fill in the new snapshot's metadata (SnapInfo decl elided above).
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();

  // Project the inode: snapshotting bumps ctime and version.
  inode_t *pi = diri->project_inode();
  pi->ctime = info.stamp;
  pi->version = diri->pre_dirty();

  // project the snaprealm
  sr_t *newsnap = diri->project_snaprealm(snapid);
  newsnap->snaps[snapid] = info;
  newsnap->seq = snapid;
  newsnap->last_created = snapid;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  // Tie the snaptable transaction to this journal entry so replay can
  // re-commit or roll it back consistently.
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
// Post-journal completion for mksnap: apply the projected inode and
// snaprealm, commit the snaptable transaction, notify clients, reply.
// NOTE(review): some lines (braces, mdr->apply() etc.) are elided from
// this excerpt.
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  // If the inode already had a snaprealm this is a plain CREATE;
  // otherwise a new realm was projected and clients must SPLIT their
  // caps into it.
  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  diri->pop_and_dirty_projected_inode(mdr->ls);

  // Commit to the snap table — makes the reserved snapid permanent.
  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // Tell clients about the realm change so they flush/split caps.
  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // Reply, reporting the newly created snapid.
  mdr->snapid = info.snapid;
  respond_to_request(mdr, 0);
// Journal-commit callback for rmsnap: runs Server::_rmsnap_finish once
// the rmsnap EUpdate has been committed to the MDS log.
// NOTE(review): member declarations (diri, snapid) and the closing
// braces are elided from this excerpt.
struct C_MDS_rmsnap_finish : public ServerLogContext {
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    // Journal write completed; apply and reply.
    server->_rmsnap_finish(mdr, diri, snapid);
/* This function takes responsibility for the passed mdr*/
// Handle CEPH_MDS_OP_RMSNAP: remove the named snapshot from the
// directory's snaprealm.  Mirrors handle_client_mksnap: checks ->
// locks -> snaptable prepare_destroy -> project inode/snaprealm ->
// journal an EUpdate, completing in Server::_rmsnap_finish.
// NOTE(review): early `return;`s, closing braces and the declaration/
// decode of `seq` are elided from this excerpt.
void Server::handle_client_rmsnap(MDRequestRef& mdr)
  MClientRequest *req = mdr->client_request;

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);

  if (!diri->is_auth()) { // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);

  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);

  const string &snapname = req->get_filepath().last_dentry();

  // Same uid-range policy as mksnap.
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.

  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);

  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  // Lock scheme identical to mksnap: xlock this inode's snaplock.
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  // On failure these presumably requeue the request themselves
  // (returns elided in this excerpt).
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))

  if (!check_access(mdr, diri, MAY_WRITE))

  // Reserve the destroy in the snap table (async; request is retried).
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));

  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  // `seq` (the table's destroy sequence) is decoded from p in lines
  // elided from this excerpt.
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // Project the inode update (ctime/version bump).
  inode_t *pi = diri->project_inode();
  pi->version = diri->pre_dirty();
  pi->ctime = mdr->get_op_stamp();

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  sr_t *newnode = diri->project_snaprealm();
  newnode->snaps.erase(snapid);
  newnode->last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  // Tie the snaptable transaction to this journal entry for replay.
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
// Post-journal completion for rmsnap: apply projected state, commit the
// snaptable destroy, notify clients, reply, then prune stale snap data.
// NOTE(review): `stid` is declared snapid_t here but version_t in the
// sibling handlers — consider unifying on version_t.  Some lines
// (braces, the decode from `p`) are elided from this excerpt.
void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  // snapidbl carries the snaptable's destroy sequence number.
  bufferlist::iterator p = mdr->more()->snapidbl.begin();

  diri->pop_and_dirty_projected_inode(mdr->ls);

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // Tell clients the snap is gone so they drop cached snapped state.
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  respond_to_request(mdr, 0);

  // purge snapshot data
  if (diri->snaprealm->have_past_parents_open())
    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
// Journal-commit callback for renamesnap: runs
// Server::_renamesnap_finish once the renamesnap EUpdate is committed.
// NOTE(review): member declarations (diri, snapid) and the closing
// braces are elided from this excerpt.
struct C_MDS_renamesnap_finish : public ServerLogContext {
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    // Journal write completed; apply and reply.
    server->_renamesnap_finish(mdr, diri, snapid);
/* This function takes responsibility for the passed mdr*/
// Handle CEPH_MDS_OP_RENAMESNAP: rename snapshot `srcname` (from
// filepath2) to `dstname` (from filepath) on the same directory inode.
// NOTE(review): early `return;`s, closing braces and the declaration/
// decode of `seq` are elided from this excerpt.
void Server::handle_client_renamesnap(MDRequestRef& mdr)
  MClientRequest *req = mdr->client_request;
  // Both filepaths must name the same directory inode.
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);

  if (!diri->is_auth()) { // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);

  // Same uid-range policy as mksnap/rmsnap.
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);

  const string &dstname = req->get_filepath().last_dentry();
  const string &srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.

  // Source snapshot must exist ...
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);

  // ... and the destination must be a legal, unused name.
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);

  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // Lock scheme identical to mksnap/rmsnap: xlock the snaplock.
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  // On failure these presumably requeue the request themselves
  // (returns elided in this excerpt).
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))

  if (!check_access(mdr, diri, MAY_WRITE))

  // Prepare the rename in the snap table (async; request is retried).
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));

  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  // `seq` is decoded from p in lines elided from this excerpt.
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // Project the inode update (ctime/version bump).
  inode_t *pi = diri->project_inode();
  pi->ctime = mdr->get_op_stamp();
  pi->version = diri->pre_dirty();

  // project the snaprealm
  sr_t *newsnap = diri->project_snaprealm();
  assert(newsnap->snaps.count(snapid));
  newsnap->snaps[snapid].name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  // Tie the snaptable transaction to this journal entry for replay.
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
// Post-journal completion for renamesnap: apply projected state, commit
// the snaptable update, notify clients, and reply.
// NOTE(review): braces and some lines are elided from this excerpt.
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  diri->pop_and_dirty_projected_inode(mdr->ls);

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // Unlike mksnap/rmsnap a third argument `true` is passed here —
  // NOTE(review): confirm its meaning against the declaration of
  // do_realm_invalidate_and_update_notify.
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);

  // Reply, reporting the (unchanged) snapid of the renamed snapshot.
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
8712 * Return true if server is in state RECONNECT and this
8713 * client has not yet reconnected.
8715 bool Server::waiting_for_reconnect(client_t c
) const
8717 return client_reconnect_gather
.count(c
) > 0;
8720 void Server::dump_reconnect_status(Formatter
*f
) const
8722 f
->open_object_section("reconnect_status");
8723 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;