1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
27 #include "MDBalancer.h"
29 #include "SnapClient.h"
32 #include "msg/Messenger.h"
34 #include "osdc/Objecter.h"
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
43 #include "messages/MMDSSlaveRequest.h"
45 #include "messages/MLock.h"
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
64 #include <boost/utility/string_view.hpp>
67 #include "common/config.h"
69 #define dout_context g_ceph_context
70 #define dout_subsys ceph_subsys_mds
72 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
74 class ServerContext
: public MDSInternalContextBase
{
77 MDSRank
*get_mds() override
83 explicit ServerContext(Server
*s
) : server(s
) {
84 assert(server
!= NULL
);
88 class ServerLogContext
: public MDSLogContextBase
{
91 MDSRank
*get_mds() override
97 void pre_finish(int r
) override
{
99 mdr
->mark_event("journal_committed: ");
102 explicit ServerLogContext(Server
*s
) : server(s
) {
103 assert(server
!= NULL
);
105 explicit ServerLogContext(Server
*s
, MDRequestRef
& r
) : server(s
), mdr(r
) {
106 assert(server
!= NULL
);
110 void Server::create_logger()
112 PerfCountersBuilder
plb(g_ceph_context
, "mds_server", l_mdss_first
, l_mdss_last
);
114 plb
.add_u64_counter(l_mdss_handle_client_request
, "handle_client_request",
115 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING
);
116 plb
.add_u64_counter(l_mdss_handle_slave_request
, "handle_slave_request",
117 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING
);
118 plb
.add_u64_counter(l_mdss_handle_client_session
,
119 "handle_client_session", "Client session messages", "hcs",
120 PerfCountersBuilder::PRIO_INTERESTING
);
122 // fop latencies are useful
123 plb
.set_prio_default(PerfCountersBuilder::PRIO_USEFUL
);
124 plb
.add_time_avg(l_mdss_req_lookuphash_latency
, "req_lookuphash_latency",
125 "Request type lookup hash of inode latency");
126 plb
.add_time_avg(l_mdss_req_lookupino_latency
, "req_lookupino_latency",
127 "Request type lookup inode latency");
128 plb
.add_time_avg(l_mdss_req_lookupparent_latency
, "req_lookupparent_latency",
129 "Request type lookup parent latency");
130 plb
.add_time_avg(l_mdss_req_lookupname_latency
, "req_lookupname_latency",
131 "Request type lookup name latency");
132 plb
.add_time_avg(l_mdss_req_lookup_latency
, "req_lookup_latency",
133 "Request type lookup latency");
134 plb
.add_time_avg(l_mdss_req_lookupsnap_latency
, "req_lookupsnap_latency",
135 "Request type lookup snapshot latency");
136 plb
.add_time_avg(l_mdss_req_getattr_latency
, "req_getattr_latency",
137 "Request type get attribute latency");
138 plb
.add_time_avg(l_mdss_req_setattr_latency
, "req_setattr_latency",
139 "Request type set attribute latency");
140 plb
.add_time_avg(l_mdss_req_setlayout_latency
, "req_setlayout_latency",
141 "Request type set file layout latency");
142 plb
.add_time_avg(l_mdss_req_setdirlayout_latency
, "req_setdirlayout_latency",
143 "Request type set directory layout latency");
144 plb
.add_time_avg(l_mdss_req_setxattr_latency
, "req_setxattr_latency",
145 "Request type set extended attribute latency");
146 plb
.add_time_avg(l_mdss_req_rmxattr_latency
, "req_rmxattr_latency",
147 "Request type remove extended attribute latency");
148 plb
.add_time_avg(l_mdss_req_readdir_latency
, "req_readdir_latency",
149 "Request type read directory latency");
150 plb
.add_time_avg(l_mdss_req_setfilelock_latency
, "req_setfilelock_latency",
151 "Request type set file lock latency");
152 plb
.add_time_avg(l_mdss_req_getfilelock_latency
, "req_getfilelock_latency",
153 "Request type get file lock latency");
154 plb
.add_time_avg(l_mdss_req_create_latency
, "req_create_latency",
155 "Request type create latency");
156 plb
.add_time_avg(l_mdss_req_open_latency
, "req_open_latency",
157 "Request type open latency");
158 plb
.add_time_avg(l_mdss_req_mknod_latency
, "req_mknod_latency",
159 "Request type make node latency");
160 plb
.add_time_avg(l_mdss_req_link_latency
, "req_link_latency",
161 "Request type link latency");
162 plb
.add_time_avg(l_mdss_req_unlink_latency
, "req_unlink_latency",
163 "Request type unlink latency");
164 plb
.add_time_avg(l_mdss_req_rmdir_latency
, "req_rmdir_latency",
165 "Request type remove directory latency");
166 plb
.add_time_avg(l_mdss_req_rename_latency
, "req_rename_latency",
167 "Request type rename latency");
168 plb
.add_time_avg(l_mdss_req_mkdir_latency
, "req_mkdir_latency",
169 "Request type make directory latency");
170 plb
.add_time_avg(l_mdss_req_symlink_latency
, "req_symlink_latency",
171 "Request type symbolic link latency");
172 plb
.add_time_avg(l_mdss_req_lssnap_latency
, "req_lssnap_latency",
173 "Request type list snapshot latency");
174 plb
.add_time_avg(l_mdss_req_mksnap_latency
, "req_mksnap_latency",
175 "Request type make snapshot latency");
176 plb
.add_time_avg(l_mdss_req_rmsnap_latency
, "req_rmsnap_latency",
177 "Request type remove snapshot latency");
178 plb
.add_time_avg(l_mdss_req_renamesnap_latency
, "req_renamesnap_latency",
179 "Request type rename snapshot latency");
181 plb
.add_u64_counter(l_mdss_cap_revoke_eviction
, "cap_revoke_eviction",
182 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING
);
184 plb
.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY
);
185 plb
.add_u64_counter(l_mdss_dispatch_client_request
, "dispatch_client_request",
186 "Client requests dispatched");
187 plb
.add_u64_counter(l_mdss_dispatch_slave_request
, "dispatch_server_request",
188 "Server requests dispatched");
190 logger
= plb
.create_perf_counters();
191 g_ceph_context
->get_perfcounters_collection()->add(logger
);
194 Server::Server(MDSRank
*m
) :
196 mdcache(mds
->mdcache
), mdlog(mds
->mdlog
),
199 reconnect_done(NULL
),
200 failed_reconnects(0),
201 reconnect_evicting(false),
202 terminating_sessions(false)
207 /* This function DOES put the passed message before returning*/
208 void Server::dispatch(Message
*m
)
210 switch (m
->get_type()) {
211 case CEPH_MSG_CLIENT_RECONNECT
:
212 handle_client_reconnect(static_cast<MClientReconnect
*>(m
));
217 // handle_slave_request()/handle_client_session() will wait if necessary
218 if (m
->get_type() == CEPH_MSG_CLIENT_REQUEST
&& !mds
->is_active()) {
219 MClientRequest
*req
= static_cast<MClientRequest
*>(m
);
220 if (mds
->is_reconnect() || mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
221 Session
*session
= mds
->get_session(req
);
222 if (!session
|| session
->is_closed()) {
223 dout(5) << "session is closed, dropping " << req
->get_reqid() << dendl
;
227 bool queue_replay
= false;
228 if (req
->is_replay()) {
229 dout(3) << "queuing replayed op" << dendl
;
231 } else if (req
->get_retry_attempt()) {
232 // process completed request in clientreplay stage. The completed request
233 // might have created new file/directorie. This guarantees MDS sends a reply
234 // to client before other request modifies the new file/directorie.
235 if (session
->have_completed_request(req
->get_reqid().tid
, NULL
)) {
236 dout(3) << "queuing completed op" << dendl
;
239 // this request was created before the cap reconnect message, drop any embedded
241 req
->releases
.clear();
244 req
->mark_queued_for_replay();
245 mds
->enqueue_replay(new C_MDS_RetryMessage(mds
, m
));
250 bool wait_for_active
= true;
251 if (mds
->is_stopping()) {
252 wait_for_active
= false;
253 } else if (mds
->is_clientreplay()) {
254 if (req
->is_queued_for_replay()) {
255 wait_for_active
= false;
258 if (wait_for_active
) {
259 dout(3) << "not active yet, waiting" << dendl
;
260 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
265 switch (m
->get_type()) {
266 case CEPH_MSG_CLIENT_SESSION
:
267 handle_client_session(static_cast<MClientSession
*>(m
));
269 case CEPH_MSG_CLIENT_REQUEST
:
270 handle_client_request(static_cast<MClientRequest
*>(m
));
272 case MSG_MDS_SLAVE_REQUEST
:
273 handle_slave_request(static_cast<MMDSSlaveRequest
*>(m
));
276 derr
<< "server unknown message " << m
->get_type() << dendl
;
277 assert(0 == "server unknown message");
283 // ----------------------------------------------------------
284 // SESSION management
286 class C_MDS_session_finish
: public ServerLogContext
{
291 interval_set
<inodeno_t
> inos
;
295 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, Context
*fin_
= NULL
) :
296 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inotablev(0), fin(fin_
) { }
297 C_MDS_session_finish(Server
*srv
, Session
*se
, uint64_t sseq
, bool s
, version_t mv
, interval_set
<inodeno_t
>& i
, version_t iv
, Context
*fin_
= NULL
) :
298 ServerLogContext(srv
), session(se
), state_seq(sseq
), open(s
), cmapv(mv
), inos(i
), inotablev(iv
), fin(fin_
) { }
299 void finish(int r
) override
{
301 server
->_session_logged(session
, state_seq
, open
, cmapv
, inos
, inotablev
);
308 /* This function DOES put the passed message before returning*/
309 void Server::handle_client_session(MClientSession
*m
)
312 bool blacklisted
= false;
313 Session
*session
= mds
->get_session(m
);
315 dout(3) << "handle_client_session " << *m
<< " from " << m
->get_source() << dendl
;
316 assert(m
->get_source().is_client()); // should _not_ come from an mds!
319 dout(0) << " ignoring sessionless msg " << *m
<< dendl
;
324 if (m
->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS
) {
325 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
326 } else if (m
->get_op() == CEPH_SESSION_REQUEST_CLOSE
) {
327 // close requests need to be handled when mds is active
328 if (mds
->get_state() < MDSMap::STATE_ACTIVE
) {
329 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
333 if (mds
->get_state() < MDSMap::STATE_CLIENTREPLAY
) {
334 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
340 logger
->inc(l_mdss_handle_client_session
);
343 switch (m
->get_op()) {
344 case CEPH_SESSION_REQUEST_OPEN
:
345 if (session
->is_opening() ||
346 session
->is_open() ||
347 session
->is_stale() ||
348 session
->is_killing() ||
349 terminating_sessions
) {
350 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl
;
351 // set client metadata for session opened by prepare_force_open_sessions
352 if (!m
->client_meta
.empty())
353 session
->set_client_metadata(m
->client_meta
);
357 assert(session
->is_closed() ||
358 session
->is_closing());
360 if (mds
->is_stopping()) {
361 dout(10) << "mds is stopping, dropping open req" << dendl
;
366 blacklisted
= mds
->objecter
->with_osdmap(
367 [session
](const OSDMap
&osd_map
) -> bool {
368 return osd_map
.is_blacklisted(session
->info
.inst
.addr
);
372 dout(10) << "rejecting blacklisted client " << session
->info
.inst
.addr
<< dendl
;
373 mds
->send_message_client(new MClientSession(CEPH_SESSION_REJECT
), session
);
378 session
->set_client_metadata(m
->client_meta
);
379 dout(20) << __func__
<< " CEPH_SESSION_REQUEST_OPEN "
380 << session
->info
.client_metadata
.size() << " metadata entries:" << dendl
;
381 for (map
<string
, string
>::iterator i
= session
->info
.client_metadata
.begin();
382 i
!= session
->info
.client_metadata
.end(); ++i
) {
383 dout(20) << " " << i
->first
<< ": " << i
->second
<< dendl
;
386 // Special case for the 'root' metadata path; validate that the claimed
387 // root is actually within the caps of the session
388 if (session
->info
.client_metadata
.count("root")) {
389 const auto claimed_root
= session
->info
.client_metadata
.at("root");
390 // claimed_root has a leading "/" which we strip before passing
392 if (claimed_root
.empty() || claimed_root
[0] != '/' ||
393 !session
->auth_caps
.path_capable(claimed_root
.substr(1))) {
394 derr
<< __func__
<< " forbidden path claimed as mount root: "
395 << claimed_root
<< " by " << m
->get_source() << dendl
;
396 // Tell the client we're rejecting their open
397 mds
->send_message_client(new MClientSession(CEPH_SESSION_REJECT
), session
);
398 mds
->clog
->warn() << "client session with invalid root '" <<
399 claimed_root
<< "' denied (" << session
->info
.inst
<< ")";
401 // Drop out; don't record this session in SessionMap or journal it.
406 if (session
->is_closed())
407 mds
->sessionmap
.add_session(session
);
409 pv
= mds
->sessionmap
.mark_projected(session
);
410 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
411 mds
->sessionmap
.touch_session(session
);
412 mdlog
->start_submit_entry(new ESession(m
->get_source_inst(), true, pv
, m
->client_meta
),
413 new C_MDS_session_finish(this, session
, sseq
, true, pv
));
417 case CEPH_SESSION_REQUEST_RENEWCAPS
:
418 if (session
->is_open() ||
419 session
->is_stale()) {
420 mds
->sessionmap
.touch_session(session
);
421 if (session
->is_stale()) {
422 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
423 mds
->locker
->resume_stale_caps(session
);
424 mds
->sessionmap
.touch_session(session
);
426 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS
, m
->get_seq()));
428 dout(10) << "ignoring renewcaps on non open|stale session (" << session
->get_state_name() << ")" << dendl
;
432 case CEPH_SESSION_REQUEST_CLOSE
:
434 if (session
->is_closed() ||
435 session
->is_closing() ||
436 session
->is_killing()) {
437 dout(10) << "already closed|closing|killing, dropping this req" << dendl
;
441 if (session
->is_importing()) {
442 dout(10) << "ignoring close req on importing session" << dendl
;
446 assert(session
->is_open() ||
447 session
->is_stale() ||
448 session
->is_opening());
449 if (m
->get_seq() < session
->get_push_seq()) {
450 dout(10) << "old push seq " << m
->get_seq() << " < " << session
->get_push_seq()
451 << ", dropping" << dendl
;
455 // We are getting a seq that is higher than expected.
456 // Handle the same as any other seqn error.
458 if (m
->get_seq() != session
->get_push_seq()) {
459 dout(0) << "old push seq " << m
->get_seq() << " != " << session
->get_push_seq()
460 << ", BUGGY!" << dendl
;
461 mds
->clog
->warn() << "incorrect push seq " << m
->get_seq() << " != "
462 << session
->get_push_seq() << ", dropping" << " from client : " << session
->get_human_name();
466 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
470 case CEPH_SESSION_FLUSHMSG_ACK
:
471 finish_flush_session(session
, m
->get_seq());
474 case CEPH_SESSION_REQUEST_FLUSH_MDLOG
:
475 if (mds
->is_active())
485 void Server::flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
)
487 for (set
<client_t
>::iterator p
= client_set
.begin(); p
!= client_set
.end(); ++p
) {
488 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
490 if (!session
->is_open() ||
491 !session
->connection
.get() ||
492 !session
->connection
->has_feature(CEPH_FEATURE_EXPORT_PEER
))
494 version_t seq
= session
->wait_for_flush(gather
.new_sub());
495 mds
->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG
, seq
), session
);
499 void Server::finish_flush_session(Session
*session
, version_t seq
)
501 list
<MDSInternalContextBase
*> finished
;
502 session
->finish_flush(seq
, finished
);
503 mds
->queue_waiters(finished
);
506 void Server::_session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
507 interval_set
<inodeno_t
>& inos
, version_t piv
)
509 dout(10) << "_session_logged " << session
->info
.inst
<< " state_seq " << state_seq
<< " " << (open
? "open":"close")
510 << " " << pv
<< dendl
;
513 assert(session
->is_closing() || session
->is_killing() ||
514 session
->is_opening()); // re-open closing session
515 session
->info
.prealloc_inos
.subtract(inos
);
516 mds
->inotable
->apply_release_ids(inos
);
517 assert(mds
->inotable
->get_version() == piv
);
520 mds
->sessionmap
.mark_dirty(session
);
523 if (session
->get_state_seq() != state_seq
) {
524 dout(10) << " journaled state_seq " << state_seq
<< " != current " << session
->get_state_seq()
525 << ", noop" << dendl
;
526 // close must have been canceled (by an import?), or any number of other things..
528 assert(session
->is_opening());
529 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
530 mds
->sessionmap
.touch_session(session
);
531 assert(session
->connection
!= NULL
);
532 session
->connection
->send_message(new MClientSession(CEPH_SESSION_OPEN
));
533 if (mdcache
->is_readonly())
534 session
->connection
->send_message(new MClientSession(CEPH_SESSION_FORCE_RO
));
535 } else if (session
->is_closing() ||
536 session
->is_killing()) {
537 // kill any lingering capabilities, leases, requests
538 while (!session
->caps
.empty()) {
539 Capability
*cap
= session
->caps
.front();
540 CInode
*in
= cap
->get_inode();
541 dout(20) << " killing capability " << ccap_string(cap
->issued()) << " on " << *in
<< dendl
;
542 mds
->locker
->remove_client_cap(in
, session
->info
.inst
.name
.num());
544 while (!session
->leases
.empty()) {
545 ClientLease
*r
= session
->leases
.front();
546 CDentry
*dn
= static_cast<CDentry
*>(r
->parent
);
547 dout(20) << " killing client lease of " << *dn
<< dendl
;
548 dn
->remove_client_lease(r
, mds
->locker
);
550 if (client_reconnect_gather
.count(session
->info
.get_client())) {
551 dout(20) << " removing client from reconnect set" << dendl
;
552 client_reconnect_gather
.erase(session
->info
.get_client());
554 if (client_reconnect_gather
.empty()) {
555 dout(7) << " client " << session
->info
.inst
<< " was last reconnect, finishing" << dendl
;
556 reconnect_gather_finish();
560 if (session
->is_closing()) {
561 // mark con disposable. if there is a fault, we will get a
562 // reset and clean it up. if the client hasn't received the
563 // CLOSE message yet, they will reconnect and get an
564 // ms_handle_remote_reset() and realize they had in fact closed.
565 // do this *before* sending the message to avoid a possible
567 if (session
->connection
!= NULL
) {
568 // Conditional because terminate_sessions will indiscrimately
569 // put sessions in CLOSING whether they ever had a conn or not.
570 session
->connection
->mark_disposable();
574 mds
->send_message_client(new MClientSession(CEPH_SESSION_CLOSE
), session
);
575 mds
->sessionmap
.set_state(session
, Session::STATE_CLOSED
);
577 mds
->sessionmap
.remove_session(session
);
578 } else if (session
->is_killing()) {
579 // destroy session, close connection
580 if (session
->connection
!= NULL
) {
581 session
->connection
->mark_down();
582 session
->connection
->set_priv(NULL
);
584 mds
->sessionmap
.remove_session(session
);
594 * Inject sessions from some source other than actual connections.
597 * - sessions inferred from journal replay
598 * - sessions learned from other MDSs during rejoin
599 * - sessions learned from other MDSs during dir/caps migration
600 * - sessions learned from other MDSs during a cross-MDS rename
602 version_t
Server::prepare_force_open_sessions(map
<client_t
,entity_inst_t
>& cm
,
603 map
<client_t
, pair
<Session
*,uint64_t> >& smap
)
605 version_t pv
= mds
->sessionmap
.get_projected();
607 dout(10) << "prepare_force_open_sessions " << pv
608 << " on " << cm
.size() << " clients"
611 mds
->objecter
->with_osdmap(
612 [this, &cm
](const OSDMap
&osd_map
) {
613 for (auto p
= cm
.begin(); p
!= cm
.end(); ) {
614 if (osd_map
.is_blacklisted(p
->second
.addr
)) {
615 dout(10) << " ignoring blacklisted client." << p
->first
616 << " (" << p
->second
.addr
<< ")" << dendl
;
624 for (map
<client_t
,entity_inst_t
>::iterator p
= cm
.begin(); p
!= cm
.end(); ++p
) {
625 Session
*session
= mds
->sessionmap
.get_or_add_session(p
->second
);
626 pv
= mds
->sessionmap
.mark_projected(session
);
628 if (session
->is_closed() ||
629 session
->is_closing() ||
630 session
->is_killing()) {
631 sseq
= mds
->sessionmap
.set_state(session
, Session::STATE_OPENING
);
633 assert(session
->is_open() ||
634 session
->is_opening() ||
635 session
->is_stale());
638 smap
[p
->first
] = make_pair(session
, sseq
);
639 session
->inc_importing();
644 void Server::finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
648 * FIXME: need to carefully consider the race conditions between a
649 * client trying to close a session and an MDS doing an import
650 * trying to force open a session...
652 dout(10) << "finish_force_open_sessions on " << smap
.size() << " clients,"
653 << " initial v " << mds
->sessionmap
.get_version() << dendl
;
655 for (auto &it
: smap
) {
656 Session
*session
= it
.second
.first
;
657 uint64_t sseq
= it
.second
.second
;
659 if (session
->get_state_seq() != sseq
) {
660 dout(10) << "force_open_sessions skipping changed " << session
->info
.inst
<< dendl
;
662 dout(10) << "force_open_sessions opened " << session
->info
.inst
<< dendl
;
663 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
664 mds
->sessionmap
.touch_session(session
);
665 mds
->send_message_client(new MClientSession(CEPH_SESSION_OPEN
), session
);
666 if (mdcache
->is_readonly())
667 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
670 dout(10) << "force_open_sessions skipping already-open " << session
->info
.inst
<< dendl
;
671 assert(session
->is_open() || session
->is_stale());
675 session
->dec_importing();
678 mds
->sessionmap
.mark_dirty(session
);
681 dout(10) << __func__
<< ": final v " << mds
->sessionmap
.get_version() << dendl
;
684 class C_MDS_TerminatedSessions
: public ServerContext
{
685 void finish(int r
) override
{
686 server
->terminating_sessions
= false;
689 explicit C_MDS_TerminatedSessions(Server
*s
) : ServerContext(s
) {}
692 void Server::terminate_sessions()
694 dout(2) << "terminate_sessions" << dendl
;
696 terminating_sessions
= true;
698 // kill them off. clients will retry etc.
699 set
<Session
*> sessions
;
700 mds
->sessionmap
.get_client_session_set(sessions
);
701 for (set
<Session
*>::const_iterator p
= sessions
.begin();
704 Session
*session
= *p
;
705 if (session
->is_closing() ||
706 session
->is_killing() ||
707 session
->is_closed())
709 journal_close_session(session
, Session::STATE_CLOSING
, NULL
);
712 mdlog
->wait_for_safe(new C_MDS_TerminatedSessions(this));
716 void Server::find_idle_sessions()
718 auto now
= clock::now();
719 auto last_cleared_laggy
= mds
->last_cleared_laggy();
721 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy
<< "s ago" << dendl
;
724 // (caps go stale, lease die)
725 double queue_max_age
= mds
->get_dispatch_queue_max_age(ceph_clock_now());
726 double cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_timeout();
728 Session
*session
= mds
->sessionmap
.get_oldest_session(Session::STATE_OPEN
);
730 auto last_cap_renew_span
= std::chrono::duration
<double>(now
-session
->last_cap_renew
).count();
731 if (last_cap_renew_span
< cutoff
) {
732 dout(20) << "laggiest active session is " << session
->info
.inst
<< " and renewed caps recently (" << last_cap_renew_span
<< "s ago)" << dendl
;
736 dout(10) << "new stale session " << session
->info
.inst
<< " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
737 mds
->sessionmap
.set_state(session
, Session::STATE_STALE
);
738 mds
->locker
->revoke_stale_caps(session
);
739 mds
->locker
->remove_stale_leases(session
);
740 mds
->send_message_client(new MClientSession(CEPH_SESSION_STALE
, session
->get_push_seq()), session
);
741 finish_flush_session(session
, session
->get_push_seq());
745 cutoff
= queue_max_age
+ mds
->mdsmap
->get_session_autoclose();
747 // don't kick clients if we've been laggy
748 if (last_cleared_laggy
< cutoff
) {
749 dout(10) << " last cleared laggy " << last_cleared_laggy
<< "s ago (< cutoff " << cutoff
750 << "), not kicking any clients to be safe" << dendl
;
754 if (mds
->sessionmap
.get_sessions().size() == 1 && mds
->mdsmap
->get_num_in_mds() == 1) {
755 dout(20) << "skipping client eviction because there is only one" << dendl
;
759 // Collect a list of sessions exceeding the autoclose threshold
760 std::vector
<Session
*> to_evict
;
761 const auto sessions_p
= mds
->sessionmap
.by_state
.find(Session::STATE_STALE
);
762 if (sessions_p
== mds
->sessionmap
.by_state
.end() || sessions_p
->second
->empty()) {
765 const auto &stale_sessions
= sessions_p
->second
;
766 assert(stale_sessions
!= nullptr);
768 for (const auto &session
: *stale_sessions
) {
769 auto last_cap_renew_span
= std::chrono::duration
<double>(now
-session
->last_cap_renew
).count();
770 if (session
->is_importing()) {
771 dout(10) << "stopping at importing session " << session
->info
.inst
<< dendl
;
774 assert(session
->is_stale());
775 if (last_cap_renew_span
< cutoff
) {
776 dout(20) << "oldest stale session is " << session
->info
.inst
<< " and recently renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
780 to_evict
.push_back(session
);
783 for (const auto &session
: to_evict
) {
784 auto last_cap_renew_span
= std::chrono::duration
<double>(now
-session
->last_cap_renew
).count();
785 mds
->clog
->warn() << "evicting unresponsive client " << *session
<< ", after " << last_cap_renew_span
<< " seconds";
786 dout(10) << "autoclosing stale session " << session
->info
.inst
<< " last renewed caps " << last_cap_renew_span
<< "s ago" << dendl
;
788 if (g_conf
->mds_session_blacklist_on_timeout
) {
789 std::stringstream ss
;
790 mds
->evict_client(session
->info
.inst
.name
.num(), false, true,
793 kill_session(session
, NULL
);
798 void Server::evict_cap_revoke_non_responders() {
799 if (!cap_revoke_eviction_timeout
) {
803 std::list
<client_t
> to_evict
;
804 mds
->locker
->get_late_revoking_clients(&to_evict
, cap_revoke_eviction_timeout
);
806 for (auto const &client
: to_evict
) {
807 mds
->clog
->warn() << "client id " << client
<< " has not responded to"
808 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
809 << " seconds, evicting";
810 dout(1) << __func__
<< ": evicting cap revoke non-responder client id "
813 std::stringstream ss
;
814 bool evicted
= mds
->evict_client(client
.v
, false,
815 g_conf
->mds_session_blacklist_on_evict
,
817 if (evicted
&& logger
) {
818 logger
->inc(l_mdss_cap_revoke_eviction
);
823 void Server::handle_conf_change(const struct md_config_t
*conf
,
824 const std::set
<std::string
> &changed
) {
825 if (changed
.count("mds_cap_revoke_eviction_timeout")) {
826 cap_revoke_eviction_timeout
= conf
->get_val
<double>("mds_cap_revoke_eviction_timeout");
827 dout(20) << __func__
<< " cap revoke eviction timeout changed to "
828 << cap_revoke_eviction_timeout
<< dendl
;
833 * XXX bump in the interface here, not using an MDSInternalContextBase here
834 * because all the callers right now happen to use a SaferCond
836 void Server::kill_session(Session
*session
, Context
*on_safe
)
838 assert(mds
->mds_lock
.is_locked_by_me());
840 if ((session
->is_opening() ||
841 session
->is_open() ||
842 session
->is_stale()) &&
843 !session
->is_importing()) {
844 dout(10) << "kill_session " << session
<< dendl
;
845 journal_close_session(session
, Session::STATE_KILLING
, on_safe
);
847 dout(10) << "kill_session importing or already closing/killing " << session
<< dendl
;
848 assert(session
->is_closing() ||
849 session
->is_closed() ||
850 session
->is_killing() ||
851 session
->is_importing());
853 on_safe
->complete(0);
858 size_t Server::apply_blacklist(const std::set
<entity_addr_t
> &blacklist
)
860 std::list
<Session
*> victims
;
861 const auto sessions
= mds
->sessionmap
.get_sessions();
862 for (const auto p
: sessions
) {
863 if (!p
.first
.is_client()) {
864 // Do not apply OSDMap blacklist to MDS daemons, we find out
865 // about their death via MDSMap.
869 Session
*s
= p
.second
;
870 if (blacklist
.count(s
->info
.inst
.addr
)) {
871 victims
.push_back(s
);
875 for (const auto s
: victims
) {
876 kill_session(s
, nullptr);
879 dout(10) << "apply_blacklist: killed " << victims
.size() << dendl
;
881 return victims
.size();
884 void Server::journal_close_session(Session
*session
, int state
, Context
*on_safe
)
886 uint64_t sseq
= mds
->sessionmap
.set_state(session
, state
);
887 version_t pv
= mds
->sessionmap
.mark_projected(session
);
890 // release alloc and pending-alloc inos for this session
891 // and wipe out session state, in case the session close aborts for some reason
892 interval_set
<inodeno_t
> both
;
893 both
.insert(session
->info
.prealloc_inos
);
894 both
.insert(session
->pending_prealloc_inos
);
896 mds
->inotable
->project_release_ids(both
);
897 piv
= mds
->inotable
->get_projected_version();
901 mdlog
->start_submit_entry(new ESession(session
->info
.inst
, false, pv
, both
, piv
),
902 new C_MDS_session_finish(this, session
, sseq
, false, pv
, both
, piv
, on_safe
));
905 // clean up requests, too
906 elist
<MDRequestImpl
*>::iterator p
=
907 session
->requests
.begin(member_offset(MDRequestImpl
,
908 item_session_request
));
910 MDRequestRef mdr
= mdcache
->request_get((*p
)->reqid
);
912 mdcache
->request_kill(mdr
);
915 finish_flush_session(session
, session
->get_push_seq());
918 void Server::reconnect_clients(MDSInternalContext
*reconnect_done_
)
920 reconnect_done
= reconnect_done_
;
922 set
<Session
*> sessions
;
923 mds
->sessionmap
.get_client_session_set(sessions
);
924 for (auto session
: sessions
) {
925 if (session
->is_open())
926 client_reconnect_gather
.insert(session
->get_client());
929 if (client_reconnect_gather
.empty()) {
930 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl
;
931 reconnect_gather_finish();
935 // clients will get the mdsmap and discover we're reconnecting via the monitor.
937 reconnect_start
= ceph_clock_now();
938 dout(1) << "reconnect_clients -- " << client_reconnect_gather
.size() << " sessions" << dendl
;
939 mds
->sessionmap
.dump();
942 /* This function DOES put the passed message before returning*/
943 void Server::handle_client_reconnect(MClientReconnect
*m
)
945 dout(7) << "handle_client_reconnect " << m
->get_source() << dendl
;
946 client_t from
= m
->get_source().num();
947 Session
*session
= mds
->get_session(m
);
950 if (!mds
->is_reconnect() && mds
->get_want_state() == CEPH_MDS_STATE_RECONNECT
) {
951 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl
;
952 mds
->wait_for_reconnect(new C_MDS_RetryMessage(mds
, m
));
956 utime_t delay
= ceph_clock_now();
957 delay
-= reconnect_start
;
958 dout(10) << " reconnect_start " << reconnect_start
<< " delay " << delay
<< dendl
;
961 if (!mds
->is_reconnect() || mds
->get_want_state() != CEPH_MDS_STATE_RECONNECT
|| reconnect_evicting
) {
962 // XXX maybe in the future we can do better than this?
963 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl
;
964 mds
->clog
->info() << "denied reconnect attempt (mds is "
965 << ceph_mds_state_name(mds
->get_state())
966 << ") from " << m
->get_source_inst()
967 << " after " << delay
<< " (allowed interval " << g_conf
->mds_reconnect_timeout
<< ")";
969 } else if (!session
->is_open()) {
970 dout(1) << " session is closed, ignoring reconnect, sending close" << dendl
;
971 mds
->clog
->info() << "denied reconnect attempt (mds is "
972 << ceph_mds_state_name(mds
->get_state())
973 << ") from " << m
->get_source_inst() << " (session is closed)";
975 } else if (mdcache
->is_readonly()) {
976 dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl
;
977 mds
->clog
->info() << "denied reconnect attempt (mds is read-only)";
982 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE
));
987 // notify client of success with an OPEN
988 m
->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN
));
989 session
->last_cap_renew
= clock::now();
990 mds
->clog
->debug() << "reconnect by " << session
->info
.inst
<< " after " << delay
;
993 for (vector
<ceph_mds_snaprealm_reconnect
>::iterator p
= m
->realms
.begin();
994 p
!= m
->realms
.end();
996 CInode
*in
= mdcache
->get_inode(inodeno_t(p
->ino
));
997 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1000 assert(in
->snaprealm
);
1001 if (in
->snaprealm
->have_past_parents_open()) {
1002 dout(15) << "open snaprealm (w/ past parents) on " << *in
<< dendl
;
1003 mdcache
->finish_snaprealm_reconnect(from
, in
->snaprealm
, snapid_t(p
->seq
));
1005 dout(15) << "open snaprealm (w/o past parents) on " << *in
<< dendl
;
1006 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
1009 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p
->ino
)
1010 << " seq " << p
->seq
<< dendl
;
1011 mdcache
->add_reconnected_snaprealm(from
, inodeno_t(p
->ino
), snapid_t(p
->seq
));
1016 for (map
<inodeno_t
, cap_reconnect_t
>::iterator p
= m
->caps
.begin();
1019 // make sure our last_cap_id is MAX over all issued caps
1020 if (p
->second
.capinfo
.cap_id
> mdcache
->last_cap_id
)
1021 mdcache
->last_cap_id
= p
->second
.capinfo
.cap_id
;
1023 CInode
*in
= mdcache
->get_inode(p
->first
);
1024 if (in
&& in
->state_test(CInode::STATE_PURGING
))
1026 if (in
&& in
->is_auth()) {
1027 // we recovered it, and it's ours. take note.
1028 dout(15) << "open cap realm " << inodeno_t(p
->second
.capinfo
.snaprealm
)
1029 << " on " << *in
<< dendl
;
1030 in
->reconnect_cap(from
, p
->second
, session
);
1031 mdcache
->add_reconnected_cap(from
, p
->first
, p
->second
);
1032 recover_filelocks(in
, p
->second
.flockbl
, m
->get_orig_source().num());
1036 if (in
&& !in
->is_auth()) {
1038 dout(10) << "non-auth " << *in
<< ", will pass off to authority" << dendl
;
1039 // add to cap export list.
1040 p
->second
.path
.clear(); // we don't need path
1041 mdcache
->rejoin_export_caps(p
->first
, from
, p
->second
,
1042 in
->authority().first
);
1044 // don't know if the inode is mine
1045 dout(10) << "missing ino " << p
->first
<< ", will load later" << dendl
;
1046 p
->second
.path
.clear(); // we don't need path
1047 mdcache
->rejoin_recovered_caps(p
->first
, from
, p
->second
, MDS_RANK_NONE
);
1050 mdcache
->rejoin_recovered_client(session
->get_client(), session
->info
.inst
);
1052 // remove from gather set
1053 client_reconnect_gather
.erase(from
);
1054 if (client_reconnect_gather
.empty())
1055 reconnect_gather_finish();
// All expected clients have reconnected (or been given up on): fire the
// stored reconnect_done continuation exactly once and clear it.
1062 void Server::reconnect_gather_finish()
1064 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects
<< " clients" << dendl
;
1065 assert(reconnect_done
);
1066 reconnect_done
->complete(0);
// Null out so a double-finish would trip the assert above.
1067 reconnect_done
= NULL
;
// Periodic reconnect-phase tick: once mds_reconnect_timeout has elapsed since
// reconnect_start, give up on clients that never reconnected — evict (and
// optionally blacklist) them, then finish the gather (immediately, or once
// the eviction contexts complete).
1070 void Server::reconnect_tick()
// Already waiting on a previous round of evictions; nothing more to do now.
1072 if (reconnect_evicting
) {
1073 dout(4) << "reconnect_tick: waiting for evictions" << dendl
;
1077 utime_t reconnect_end
= reconnect_start
;
1078 reconnect_end
+= g_conf
->mds_reconnect_timeout
;
1079 if (ceph_clock_now() >= reconnect_end
&&
1080 !client_reconnect_gather
.empty()) {
1081 dout(10) << "reconnect timed out" << dendl
;
1083 // If we're doing blacklist evictions, use this to wait for them before
1084 // proceeding to reconnect_gather_finish
1085 MDSGatherBuilder
gather(g_ceph_context
);
// Every client still in the gather set is unresponsive; evict each one.
1087 for (set
<client_t
>::iterator p
= client_reconnect_gather
.begin();
1088 p
!= client_reconnect_gather
.end();
1090 Session
*session
= mds
->sessionmap
.get_session(entity_name_t::CLIENT(p
->v
));
1092 dout(1) << "reconnect gave up on " << session
->info
.inst
<< dendl
;
1094 mds
->clog
->warn() << "evicting unresponsive client " << *session
1095 << ", after waiting " << g_conf
->mds_reconnect_timeout
1096 << " seconds during MDS startup";
// Blacklist-based eviction when configured; otherwise just kill the session.
1098 if (g_conf
->mds_session_blacklist_on_timeout
) {
1099 std::stringstream ss
;
1100 mds
->evict_client(session
->info
.inst
.name
.num(), false, true, ss
,
1103 kill_session(session
, NULL
);
1106 failed_reconnects
++;
1108 client_reconnect_gather
.clear();
// Defer the finish until eviction sub-contexts complete, if any were added.
1110 if (gather
.has_subs()) {
1111 dout(1) << "reconnect will complete once clients are evicted" << dendl
;
1112 gather
.set_finisher(new MDSInternalContextWrapper(mds
, new FunctionContext(
1113 [this](int r
){reconnect_gather_finish();})));
1115 reconnect_evicting
= true;
1117 reconnect_gather_finish();
// Rebuilds an inode's in-memory file-lock state from a reconnecting client's
// serialized lock blob: first the fcntl (POSIX) locks, then the flock locks,
// each preceded by a count. Locks are re-owned by `client`.
// NOTE(review): local declarations and the per-lock ::decode calls appear to
// be elided by the extraction (numbering gaps at 1125-1126, 1130, 1137).
1122 void Server::recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
)
// Nothing to recover for clients that held no locks.
1124 if (!locks
.length()) return;
1127 bufferlist::iterator p
= locks
.begin();
1128 ::decode(numlocks
, p
);
// fcntl locks.
1129 for (int i
= 0; i
< numlocks
; ++i
) {
1131 lock
.client
= client
;
1132 in
->get_fcntl_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
>(lock
.start
, lock
));
1133 ++in
->get_fcntl_lock_state()->client_held_lock_counts
[client
];
1135 ::decode(numlocks
, p
);
// flock locks.
1136 for (int i
= 0; i
< numlocks
; ++i
) {
1138 lock
.client
= client
;
1139 in
->get_flock_lock_state()->held_locks
.insert(pair
<uint64_t, ceph_filelock
> (lock
.start
, lock
));
1140 ++in
->get_flock_lock_state()->client_held_lock_counts
[client
];
// Asks clients to voluntarily release capabilities when the cache is
// oversized, sending CEPH_SESSION_RECALL_STATE with a per-session cap limit.
1146 * Call this when the MDCache is oversized, to send requests to the clients
1147 * to trim some caps, and consequently unpin some inodes in the MDCache so
1148 * that it can trim too.
1150 void Server::recall_client_state(void)
1152 /* try to recall at least 80% of all caps */
// Per-client ceiling/floor from config; ensure ceiling stays above floor.
1153 uint64_t max_caps_per_client
= Capability::count() * g_conf
->get_val
<double>("mds_max_ratio_caps_per_client");
1154 uint64_t min_caps_per_client
= g_conf
->get_val
<uint64_t>("mds_min_caps_per_client");
1155 if (max_caps_per_client
< min_caps_per_client
) {
1156 dout(0) << "max_caps_per_client " << max_caps_per_client
1157 << " < min_caps_per_client " << min_caps_per_client
<< dendl
;
1158 max_caps_per_client
= min_caps_per_client
+ 1;
1161 /* unless this ratio is smaller: */
1162 /* ratio: determine the amount of caps to recall from each client. Use
1163 * percentage full over the cache reservation. Cap the ratio at 80% of client
// Fraction of caps each client may keep; the fuller the cache, the smaller.
1165 double ratio
= 1.0-fmin(0.80, mdcache
->cache_toofull_ratio());
1167 dout(10) << "recall_client_state " << ratio
1168 << ", caps per client " << min_caps_per_client
<< "-" << max_caps_per_client
1171 set
<Session
*> sessions
;
1172 mds
->sessionmap
.get_client_session_set(sessions
);
1173 for (auto &session
: sessions
) {
// Only open client sessions participate in recall.
1174 if (!session
->is_open() ||
1175 !session
->info
.inst
.name
.is_client())
1178 dout(10) << " session " << session
->info
.inst
1179 << " caps " << session
->caps
.size()
1180 << ", leases " << session
->leases
.size()
// New limit = caps*ratio, clamped into [min_caps_per_client, max_caps_per_client].
1183 uint64_t newlim
= MAX(MIN((session
->caps
.size() * ratio
), max_caps_per_client
), min_caps_per_client
);
1184 if (session
->caps
.size() > newlim
) {
1185 MClientSession
*m
= new MClientSession(CEPH_SESSION_RECALL_STATE
);
1186 m
->head
.max_caps
= newlim
;
1187 mds
->send_message_client(m
, session
);
1188 session
->notify_recall_sent(newlim
);
// Broadcasts CEPH_SESSION_FORCE_RO to every open or stale client session,
// forcing clients into read-only mode (used when the FS goes read-only).
1193 void Server::force_clients_readonly()
1195 dout(10) << "force_clients_readonly" << dendl
;
1196 set
<Session
*> sessions
;
1197 mds
->sessionmap
.get_client_session_set(sessions
);
1198 for (set
<Session
*>::const_iterator p
= sessions
.begin();
1199 p
!= sessions
.end();
1201 Session
*session
= *p
;
// Skip non-client sessions and sessions that are neither open nor stale.
1202 if (!session
->info
.inst
.name
.is_client() ||
1203 !(session
->is_open() || session
->is_stale()))
1205 mds
->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO
), session
);
// Common tail for update ops: optionally send an early (unsafe) reply, then
// journal the event `le`, completing with `fin` once it commits. Handles the
// replay-queue bookkeeping for ops that came in via client replay.
1210 * some generic stuff for finishing off requests
1212 void Server::journal_and_reply(MDRequestRef
& mdr
, CInode
*in
, CDentry
*dn
, LogEvent
*le
, MDSLogContextBase
*fin
)
1214 dout(10) << "journal_and_reply tracei " << in
<< " tracedn " << dn
<< dendl
;
1215 assert(!mdr
->has_completed
);
1217 // note trace items for eventual reply.
// May set mdr->did_early_reply; the safe reply follows the journal commit.
1226 early_reply(mdr
, in
, dn
);
1228 mdr
->committing
= true;
1229 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
// Replayed ops drive the replay queue: advance it, or flush after the last.
1231 if (mdr
->client_request
&& mdr
->client_request
->is_queued_for_replay()) {
1232 if (mds
->queue_one_replay()) {
1233 dout(10) << " queued next replay op" << dendl
;
1235 dout(10) << " journaled last replay op, flushing" << dendl
;
1238 } else if (mdr
->did_early_reply
)
// Early reply already told the client; rdlocks are no longer needed.
1239 mds
->locker
->drop_rdlocks_for_early_reply(mdr
.get());
// Submits a log event to the MDLog with completion `fin`, tagging the
// request's event trail so the journaling step shows up in op tracking.
1244 void Server::submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
, MDRequestRef
& mdr
,
1248 string
event_str("submit entry: ");
1250 mdr
->mark_event_string(event_str
);
1252 mdlog
->submit_entry(le
, fin
);
// Routes a completion: client requests get an MClientReply built with result
// `r`; internal ops complete their registered finisher and are torn down here.
1256 * send response built from mdr contents and error code; clean up mdr
1258 void Server::respond_to_request(MDRequestRef
& mdr
, int r
)
1260 if (mdr
->client_request
) {
1261 reply_client_request(mdr
, new MClientReply(mdr
->client_request
, r
));
1262 } else if (mdr
->internal_op
> -1) {
1263 dout(10) << "respond_to_request on internal request " << mdr
<< dendl
;
// Internal ops must have registered a finisher; anything else is a bug.
1264 if (!mdr
->internal_op_finish
)
1265 assert(0 == "trying to respond to internal op without finisher");
1266 mdr
->internal_op_finish
->complete(r
);
1267 mdcache
->request_finish(mdr
);
// Maps a client op code to its per-op latency perf counter and records `lat`.
// Unknown ops abort: every dispatched op type must have a counter.
// NOTE(review): the `break;` after each case appears elided by the extraction
// (numbering gaps, e.g. 1277 -> 1279).
1271 // statistics mds req op number and latency
1272 void Server::perf_gather_op_latency(const MClientRequest
* req
, utime_t lat
)
1274 int code
= l_mdss_first
;
1275 switch(req
->get_op()) {
1276 case CEPH_MDS_OP_LOOKUPHASH
:
1277 code
= l_mdss_req_lookuphash_latency
;
1279 case CEPH_MDS_OP_LOOKUPINO
:
1280 code
= l_mdss_req_lookupino_latency
;
1282 case CEPH_MDS_OP_LOOKUPPARENT
:
1283 code
= l_mdss_req_lookupparent_latency
;
1285 case CEPH_MDS_OP_LOOKUPNAME
:
1286 code
= l_mdss_req_lookupname_latency
;
1288 case CEPH_MDS_OP_LOOKUP
:
1289 code
= l_mdss_req_lookup_latency
;
1291 case CEPH_MDS_OP_LOOKUPSNAP
:
1292 code
= l_mdss_req_lookupsnap_latency
;
1294 case CEPH_MDS_OP_GETATTR
:
1295 code
= l_mdss_req_getattr_latency
;
1297 case CEPH_MDS_OP_SETATTR
:
1298 code
= l_mdss_req_setattr_latency
;
1300 case CEPH_MDS_OP_SETLAYOUT
:
1301 code
= l_mdss_req_setlayout_latency
;
1303 case CEPH_MDS_OP_SETDIRLAYOUT
:
1304 code
= l_mdss_req_setdirlayout_latency
;
1306 case CEPH_MDS_OP_SETXATTR
:
1307 code
= l_mdss_req_setxattr_latency
;
1309 case CEPH_MDS_OP_RMXATTR
:
1310 code
= l_mdss_req_rmxattr_latency
;
1312 case CEPH_MDS_OP_READDIR
:
1313 code
= l_mdss_req_readdir_latency
;
1315 case CEPH_MDS_OP_SETFILELOCK
:
1316 code
= l_mdss_req_setfilelock_latency
;
1318 case CEPH_MDS_OP_GETFILELOCK
:
1319 code
= l_mdss_req_getfilelock_latency
;
1321 case CEPH_MDS_OP_CREATE
:
1322 code
= l_mdss_req_create_latency
;
1324 case CEPH_MDS_OP_OPEN
:
1325 code
= l_mdss_req_open_latency
;
1327 case CEPH_MDS_OP_MKNOD
:
1328 code
= l_mdss_req_mknod_latency
;
1330 case CEPH_MDS_OP_LINK
:
1331 code
= l_mdss_req_link_latency
;
1333 case CEPH_MDS_OP_UNLINK
:
1334 code
= l_mdss_req_unlink_latency
;
1336 case CEPH_MDS_OP_RMDIR
:
1337 code
= l_mdss_req_rmdir_latency
;
1339 case CEPH_MDS_OP_RENAME
:
1340 code
= l_mdss_req_rename_latency
;
1342 case CEPH_MDS_OP_MKDIR
:
1343 code
= l_mdss_req_mkdir_latency
;
1345 case CEPH_MDS_OP_SYMLINK
:
1346 code
= l_mdss_req_symlink_latency
;
1348 case CEPH_MDS_OP_LSSNAP
:
1349 code
= l_mdss_req_lssnap_latency
;
1351 case CEPH_MDS_OP_MKSNAP
:
1352 code
= l_mdss_req_mksnap_latency
;
1354 case CEPH_MDS_OP_RMSNAP
:
1355 code
= l_mdss_req_rmsnap_latency
;
1357 case CEPH_MDS_OP_RENAMESNAP
:
1358 code
= l_mdss_req_renamesnap_latency
;
1360 default: ceph_abort();
1362 logger
->tinc(code
, lat
);
// Sends an "unsafe" reply to the client before the journal commit, when
// allowed. Bails out for: config disabled, explicit no_early_reply, journaled
// slaves, freshly-allocated inos, mds-originated requests, and replays.
// On success marks mdr->did_early_reply and updates reply perf counters.
1365 void Server::early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
)
1367 if (!g_conf
->mds_early_reply
)
1370 if (mdr
->no_early_reply
) {
1371 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl
;
1375 if (mdr
->has_more() && mdr
->more()->has_journaled_slaves
) {
1376 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl
;
1380 if (mdr
->alloc_ino
) {
1381 dout(10) << "early_reply - allocated ino, not allowed" << dendl
;
1385 MClientRequest
*req
= mdr
->client_request
;
1386 entity_inst_t client_inst
= req
->get_source_inst();
1387 if (client_inst
.name
.is_mds())
1390 if (req
->is_replay()) {
1391 dout(10) << " no early reply on replay op" << dendl
;
// The unsafe flag tells the client this reply precedes journal commit.
1396 MClientReply
*reply
= new MClientReply(req
, 0);
1397 reply
->set_unsafe();
1399 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1401 //_rename_finish() does not send dentry link/unlink message to replicas.
1402 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1403 // that have projected linkages from getting new replica.
1404 mds
->locker
->set_xlocks_done(mdr
.get(), req
->get_op() == CEPH_MDS_OP_RENAME
);
1406 dout(10) << "early_reply " << reply
->get_result()
1407 << " (" << cpp_strerror(reply
->get_result())
1408 << ") " << *req
<< dendl
;
// Drop cap releases on the trace targets and embed the metadata trace.
1410 if (tracei
|| tracedn
) {
1412 mdr
->cap_releases
.erase(tracei
->vino());
1414 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1416 set_trace_dist(mdr
->session
, reply
, tracei
, tracedn
, mdr
->snapid
,
1417 req
->get_dentry_wanted(), mdr
);
1420 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1421 req
->get_connection()->send_message(reply
);
1423 mdr
->did_early_reply
= true;
// Accounting: global reply counter/latency plus per-op latency histogram.
1425 mds
->logger
->inc(l_mds_reply
);
1426 utime_t lat
= ceph_clock_now() - req
->get_recv_stamp();
1427 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1428 if (client_inst
.name
.is_client()) {
1429 mds
->sessionmap
.hit_session(mdr
->session
);
1431 perf_gather_op_latency(req
, lat
);
1432 dout(20) << "lat " << lat
<< dendl
;
1434 mdr
->mark_event("early_replied");
// Sends the final (safe) reply for a client request and tears the request
// down: records completed writes in the session, applies preallocated inos,
// drops non-rdlocks, optionally attaches the trace, and advances the replay
// queue when needed.
1439 * include a trace to tracei
1442 void Server::reply_client_request(MDRequestRef
& mdr
, MClientReply
*reply
)
1445 MClientRequest
*req
= mdr
->client_request
;
1447 dout(7) << "reply_client_request " << reply
->get_result()
1448 << " (" << cpp_strerror(reply
->get_result())
1449 << ") " << *req
<< dendl
;
1451 mdr
->mark_event("replying");
1453 Session
*session
= mdr
->session
;
1455 // note successful request in session map?
1457 // setfilelock requests are special, they only modify states in MDS memory.
1458 // The states get lost when MDS fails. If Client re-send a completed
1459 // setfilelock request, it means that client did not receive corresponding
1460 // setfilelock reply. So MDS should re-execute the setfilelock request.
1461 if (req
->may_write() && req
->get_op() != CEPH_MDS_OP_SETFILELOCK
&&
1462 reply
->get_result() == 0 && session
) {
1463 inodeno_t created
= mdr
->alloc_ino
? mdr
->alloc_ino
: mdr
->used_prealloc_ino
;
1464 session
->add_completed_request(mdr
->reqid
.tid
, created
);
// Session record dirtied — flag it for flush at log-segment expiry.
1466 mdr
->ls
->touched_sessions
.insert(session
->info
.inst
.name
);
1470 // give any preallocated inos to the session
1471 apply_allocated_inos(mdr
, session
);
1473 // get tracei/tracedn from mdr?
1474 snapid_t snapid
= mdr
->snapid
;
1475 CInode
*tracei
= mdr
->tracei
;
1476 CDentry
*tracedn
= mdr
->tracedn
;
1478 bool is_replay
= mdr
->client_request
->is_replay();
1479 bool did_early_reply
= mdr
->did_early_reply
;
1480 entity_inst_t client_inst
= req
->get_source_inst();
1481 int dentry_wanted
= req
->get_dentry_wanted();
// Perf accounting happens here only if early_reply didn't already do it.
1483 if (!did_early_reply
&& !is_replay
) {
1485 mds
->logger
->inc(l_mds_reply
);
1486 utime_t lat
= ceph_clock_now() - mdr
->client_request
->get_recv_stamp();
1487 mds
->logger
->tinc(l_mds_reply_latency
, lat
);
1488 if (client_inst
.name
.is_client()) {
1489 mds
->sessionmap
.hit_session(session
);
1491 perf_gather_op_latency(req
, lat
);
1492 dout(20) << "lat " << lat
<< dendl
;
1495 mdr
->cap_releases
.erase(tracei
->vino());
1497 mdr
->cap_releases
.erase(tracedn
->get_dir()->get_inode()->vino());
1500 // drop non-rdlocks before replying, so that we can issue leases
1501 mdcache
->request_drop_non_rdlocks(mdr
);
1504 if (client_inst
.name
.is_mds() || !session
) {
1505 reply
->put(); // mds doesn't need a reply
1509 if (!did_early_reply
&& // don't issue leases if we sent an earlier reply already
1510 (tracei
|| tracedn
)) {
1513 mdcache
->try_reconnect_cap(tracei
, session
);
1515 // include metadata in reply
1516 set_trace_dist(session
, reply
, tracei
, tracedn
,
1517 snapid
, dentry_wanted
,
1522 // We can set the extra bl unconditionally: if it's already been sent in the
1523 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1524 reply
->set_extra_bl(mdr
->reply_extra_bl
);
1526 reply
->set_mdsmap_epoch(mds
->mdsmap
->get_epoch());
1527 req
->get_connection()->send_message(reply
);
// During replay: report replay failures loudly and keep the queue moving.
1530 if (req
->is_queued_for_replay() &&
1531 (mdr
->has_completed
|| reply
->get_result() < 0)) {
1532 if (reply
->get_result() < 0) {
1533 int r
= reply
->get_result();
1534 derr
<< "reply_client_request: failed to replay " << *req
1535 << " error " << r
<< " (" << cpp_strerror(r
) << ")" << dendl
;
1536 mds
->clog
->warn() << "failed to replay " << req
->get_reqid() << " error " << r
;
1538 mds
->queue_one_replay();
1542 mdcache
->request_finish(mdr
);
1544 // take a closer look at tracei, if it happens to be a remote link
1547 tracedn
->get_projected_linkage()->is_remote()) {
1548 mdcache
->eval_remote(tracedn
);
// Appends an empty (default-constructed) DirStat to `bl`.
// NOTE(review): the encode call itself appears elided by the extraction.
1553 void Server::encode_empty_dirstat(bufferlist
& bl
)
1555 static DirStat empty
;
// Appends an infinite-duration lease record to `bl`.
// NOTE(review): the lease struct setup/encode lines appear elided here.
1559 void Server::encode_infinite_lease(bufferlist
& bl
)
1566 dout(20) << "encode_infinite_lease " << e
<< dendl
;
// Appends a null (no-lease) record to `bl`.
// NOTE(review): the lease struct setup/encode lines appear elided here.
1569 void Server::encode_null_lease(bufferlist
& bl
)
1576 dout(20) << "encode_null_lease " << e
<< dendl
;
// Builds the metadata "trace" carried in a client reply: snap trace, then
// (optionally) parent inode + dir + dentry (with lease), then the target
// inode stat. May deliberately skip the trace for fault injection.
1581 * pass inode OR dentry (not both, or we may get confused)
1583 * trace is in reverse order (i.e. root inode comes last)
1585 void Server::set_trace_dist(Session
*session
, MClientReply
*reply
,
1586 CInode
*in
, CDentry
*dn
,
1591 // skip doing this for debugging purposes?
1592 if (g_conf
->mds_inject_traceless_reply_probability
&&
1593 mdr
->ls
&& !mdr
->o_trunc
&&
1594 (rand() % 10000 < g_conf
->mds_inject_traceless_reply_probability
* 10000.0)) {
1595 dout(5) << "deliberately skipping trace for " << *reply
<< dendl
;
1599 // inode, dentry, dir, ..., inode
1601 mds_rank_t whoami
= mds
->get_nodeid();
1602 client_t client
= session
->get_client();
1603 utime_t now
= ceph_clock_now();
1605 dout(20) << "set_trace_dist snapid " << snapid
<< dendl
;
1607 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
// Snap trace: taken from the inode's realm, or the dentry's parent's realm.
1610 if (snapid
== CEPH_NOSNAP
) {
1613 realm
= in
->find_snaprealm();
1615 realm
= dn
->get_dir()->get_inode()->find_snaprealm();
1616 reply
->snapbl
= realm
->get_snap_trace();
1617 dout(10) << "set_trace_dist snaprealm " << *realm
<< " len=" << reply
->snapbl
.length() << dendl
;
// Dentry branch: encode parent inode stat, dir stat, name, and a lease
// (a real client lease for head, a null lease for snapped lookups).
1622 reply
->head
.is_dentry
= 1;
1623 CDir
*dir
= dn
->get_dir();
1624 CInode
*diri
= dir
->get_inode();
1626 diri
->encode_inodestat(bl
, session
, NULL
, snapid
);
1627 dout(20) << "set_trace_dist added diri " << *diri
<< dendl
;
1629 #ifdef MDS_VERIFY_FRAGSTAT
1630 if (dir
->is_complete())
1631 dir
->verify_fragstat();
1633 dir
->encode_dirstat(bl
, whoami
);
1634 dout(20) << "set_trace_dist added dir " << *dir
<< dendl
;
1636 ::encode(dn
->get_name(), bl
);
1637 if (snapid
== CEPH_NOSNAP
)
1638 mds
->locker
->issue_client_lease(dn
, client
, bl
, now
, session
);
1640 encode_null_lease(bl
);
1641 dout(20) << "set_trace_dist added dn " << snapid
<< " " << *dn
<< dendl
;
1643 reply
->head
.is_dentry
= 0;
// Target inode stat (if any), then attach the assembled trace buffer.
1647 in
->encode_inodestat(bl
, session
, NULL
, snapid
, 0, mdr
->getattr_caps
);
1648 dout(20) << "set_trace_dist added in " << *in
<< dendl
;
1649 reply
->head
.is_target
= 1;
1651 reply
->head
.is_target
= 0;
1653 reply
->set_trace(bl
);
// Entry point for an incoming MClientRequest: validates session state,
// detects retried/replayed requests that already completed (answering from
// the session's completed-request cache or downgrading them to lookups),
// trims the completed-request list, registers an MDRequest, processes
// embedded cap releases, and dispatches.
1660 * process a client request
1661 * This function DOES put the passed message before returning
1663 void Server::handle_client_request(MClientRequest
*req
)
1665 dout(4) << "handle_client_request " << *req
<< dendl
;
1668 mds
->logger
->inc(l_mds_request
);
1670 logger
->inc(l_mdss_handle_client_request
);
// The cache root must be open before any request can be processed.
1672 if (!mdcache
->is_open()) {
1673 dout(5) << "waiting for root" << dendl
;
1674 mdcache
->wait_for_open(new C_MDS_RetryMessage(mds
, req
));
// Look up and sanity-check the client's session; drop requests on
// missing/closed/closing/killing sessions.
1679 Session
*session
= 0;
1680 if (req
->get_source().is_client()) {
1681 session
= mds
->get_session(req
);
1683 dout(5) << "no session for " << req
->get_source() << ", dropping" << dendl
;
1684 } else if (session
->is_closed() ||
1685 session
->is_closing() ||
1686 session
->is_killing()) {
1687 dout(5) << "session closed|closing|killing, dropping" << dendl
;
// Even a dropped replayed request must advance the replay queue.
1691 if (req
->is_queued_for_replay())
1692 mds
->queue_one_replay();
1699 if (req
->get_mdsmap_epoch() < mds
->mdsmap
->get_epoch()) {
1700 // send it? hrm, this isn't ideal; they may get a lot of copies if
1701 // they have a high request rate.
1704 // completed request?
1705 bool has_completed
= false;
1706 if (req
->is_replay() || req
->get_retry_attempt()) {
1709 if (session
->have_completed_request(req
->get_reqid().tid
, &created
)) {
1710 has_completed
= true;
1711 // Don't send traceless reply if the completed request has created
1712 // new inode. Treat the request as lookup request instead.
1713 if (req
->is_replay() ||
1714 ((created
== inodeno_t() || !mds
->is_clientreplay()) &&
1715 req
->get_op() != CEPH_MDS_OP_OPEN
&&
1716 req
->get_op() != CEPH_MDS_OP_CREATE
)) {
1717 dout(5) << "already completed " << req
->get_reqid() << dendl
;
1718 MClientReply
*reply
= new MClientReply(req
, 0);
1719 if (created
!= inodeno_t()) {
1721 ::encode(created
, extra
);
1722 reply
->set_extra_bl(extra
);
1724 req
->get_connection()->send_message(reply
);
1726 if (req
->is_queued_for_replay())
1727 mds
->queue_one_replay();
// Completed create-style op: rewrite it in place as a lookup/getattr so
// the client still gets a full trace for the inode it created.
1732 if (req
->get_op() != CEPH_MDS_OP_OPEN
&&
1733 req
->get_op() != CEPH_MDS_OP_CREATE
) {
1734 dout(10) << " completed request which created new inode " << created
1735 << ", convert it to lookup request" << dendl
;
1736 req
->head
.op
= req
->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP
: CEPH_MDS_OP_GETATTR
;
1737 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
1742 // trim completed_request list
1743 if (req
->get_oldest_client_tid() > 0) {
1744 dout(15) << " oldest_client_tid=" << req
->get_oldest_client_tid() << dendl
;
1746 if (session
->trim_completed_requests(req
->get_oldest_client_tid())) {
1747 // Sessions 'completed_requests' was dirtied, mark it to be
1748 // potentially flushed at segment expiry.
1749 mdlog
->get_current_segment()->touched_sessions
.insert(session
->info
.inst
.name
);
1751 if (session
->get_num_trim_requests_warnings() > 0 &&
1752 session
->get_num_completed_requests() * 2 < g_conf
->mds_max_completed_requests
)
1753 session
->reset_num_trim_requests_warnings();
// Warn (with exponential backoff via the warning count) about clients
// that never advance oldest_client_tid and so pin completed requests.
1755 if (session
->get_num_completed_requests() >=
1756 (g_conf
->mds_max_completed_requests
<< session
->get_num_trim_requests_warnings())) {
1757 session
->inc_num_trim_requests_warnings();
1759 ss
<< "client." << session
->get_client() << " does not advance its oldest_client_tid ("
1760 << req
->get_oldest_client_tid() << "), "
1761 << session
->get_num_completed_requests()
1762 << " completed requests recorded in session\n";
1763 mds
->clog
->warn() << ss
.str();
1764 dout(20) << __func__
<< " " << ss
.str() << dendl
;
1769 // register + dispatch
1770 MDRequestRef mdr
= mdcache
->request_start(req
);
1775 mdr
->session
= session
;
1776 session
->requests
.push_back(&mdr
->item_session_request
);
1780 mdr
->has_completed
= true;
1782 // process embedded cap releases?
1783 // (only if NOT replay!)
1784 if (!req
->releases
.empty() && req
->get_source().is_client() && !req
->is_replay()) {
1785 client_t client
= req
->get_source().num();
1786 for (vector
<MClientRequest::Release
>::iterator p
= req
->releases
.begin();
1787 p
!= req
->releases
.end();
1789 mds
->locker
->process_request_cap_release(mdr
, client
, p
->item
, p
->dname
);
1790 req
->releases
.clear();
1793 dispatch_client_request(mdr
);
// Refreshes the cached `is_full` flag from the metadata pool's FULL flag
// whenever a new OSDMap arrives; dispatch uses it to reject space-consuming ops.
1797 void Server::handle_osd_map()
1799 /* Note that we check the OSDMAP_FULL flag directly rather than
1800 * using osdmap_full_flag(), because we want to know "is the flag set"
1801 * rather than "does the flag apply to us?" */
1802 mds
->objecter
->with_osdmap([this](const OSDMap
& o
) {
1803 auto pi
= o
.get_pg_pool(mds
->mdsmap
->get_metadata_pool());
// Pool may be absent from the map; treat that as not-full.
1804 is_full
= pi
&& pi
->has_flag(pg_pool_t::FLAG_FULL
);
1805 dout(7) << __func__
<< ": full = " << is_full
<< " epoch = "
1806 << o
.get_epoch() << dendl
;
// Dispatches a registered MDRequest to its per-op handler. First enforces
// kill/abort state, read-only FS for writes, slave errors, and ENOSPC
// rejection of space-consuming ops when the metadata pool is full.
// NOTE(review): `break;` lines between switch cases appear elided by the
// extraction; also line 1846 repeats the SETLAYOUT test from 1844 —
// presumably one of them was a different op in the original. Verify upstream.
1810 void Server::dispatch_client_request(MDRequestRef
& mdr
)
1812 // we shouldn't be waiting on anyone.
1813 assert(!mdr
->has_more() || mdr
->more()->waiting_on_slave
.empty());
1816 dout(10) << "request " << *mdr
<< " was killed" << dendl
;
1818 } else if (mdr
->aborted
) {
1819 mdr
->aborted
= false;
1820 mdcache
->request_kill(mdr
);
1824 MClientRequest
*req
= mdr
->client_request
;
1826 if (logger
) logger
->inc(l_mdss_dispatch_client_request
);
1828 dout(7) << "dispatch_client_request " << *req
<< dendl
;
// Write-op gating: read-only FS and propagated slave errors fail fast.
1830 if (req
->may_write()) {
1831 if (mdcache
->is_readonly()) {
1832 dout(10) << " read-only FS" << dendl
;
1833 respond_to_request(mdr
, -EROFS
);
1836 if (mdr
->has_more() && mdr
->more()->slave_error
) {
1837 dout(10) << " got error from slaves" << dendl
;
1838 respond_to_request(mdr
, mdr
->more()->slave_error
);
// Full-pool gating: ops that consume metadata space get ENOSPC; a
// link/rename is only rejected before any slave request has started.
1844 if (req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1845 req
->get_op() == CEPH_MDS_OP_SETDIRLAYOUT
||
1846 req
->get_op() == CEPH_MDS_OP_SETLAYOUT
||
1847 req
->get_op() == CEPH_MDS_OP_RMXATTR
||
1848 req
->get_op() == CEPH_MDS_OP_SETXATTR
||
1849 req
->get_op() == CEPH_MDS_OP_CREATE
||
1850 req
->get_op() == CEPH_MDS_OP_SYMLINK
||
1851 req
->get_op() == CEPH_MDS_OP_MKSNAP
||
1852 ((req
->get_op() == CEPH_MDS_OP_LINK
||
1853 req
->get_op() == CEPH_MDS_OP_RENAME
) &&
1854 (!mdr
->has_more() || mdr
->more()->witnessed
.empty())) // haven't started slave request
1857 dout(20) << __func__
<< ": full, responding ENOSPC to op " << ceph_mds_op_name(req
->get_op()) << dendl
;
1858 respond_to_request(mdr
, -ENOSPC
);
1861 dout(20) << __func__
<< ": full, permitting op " << ceph_mds_op_name(req
->get_op()) << dendl
;
// Per-op dispatch.
1865 switch (req
->get_op()) {
1866 case CEPH_MDS_OP_LOOKUPHASH
:
1867 case CEPH_MDS_OP_LOOKUPINO
:
1868 handle_client_lookup_ino(mdr
, false, false);
1870 case CEPH_MDS_OP_LOOKUPPARENT
:
1871 handle_client_lookup_ino(mdr
, true, false);
1873 case CEPH_MDS_OP_LOOKUPNAME
:
1874 handle_client_lookup_ino(mdr
, false, true);
1878 case CEPH_MDS_OP_LOOKUP
:
1879 handle_client_getattr(mdr
, true);
1882 case CEPH_MDS_OP_LOOKUPSNAP
:
1883 // lookupsnap does not reference a CDentry; treat it as a getattr
1884 case CEPH_MDS_OP_GETATTR
:
1885 handle_client_getattr(mdr
, false);
1888 case CEPH_MDS_OP_SETATTR
:
1889 handle_client_setattr(mdr
);
1891 case CEPH_MDS_OP_SETLAYOUT
:
1892 handle_client_setlayout(mdr
);
1894 case CEPH_MDS_OP_SETDIRLAYOUT
:
1895 handle_client_setdirlayout(mdr
);
1897 case CEPH_MDS_OP_SETXATTR
:
1898 handle_client_setxattr(mdr
);
1900 case CEPH_MDS_OP_RMXATTR
:
1901 handle_client_removexattr(mdr
);
1904 case CEPH_MDS_OP_READDIR
:
1905 handle_client_readdir(mdr
);
1908 case CEPH_MDS_OP_SETFILELOCK
:
1909 handle_client_file_setlock(mdr
);
1912 case CEPH_MDS_OP_GETFILELOCK
:
1913 handle_client_file_readlock(mdr
);
1917 case CEPH_MDS_OP_CREATE
:
1918 if (mdr
->has_completed
)
1919 handle_client_open(mdr
); // already created.. just open
1921 handle_client_openc(mdr
);
1924 case CEPH_MDS_OP_OPEN
:
1925 handle_client_open(mdr
);
1930 case CEPH_MDS_OP_MKNOD
:
1931 handle_client_mknod(mdr
);
1933 case CEPH_MDS_OP_LINK
:
1934 handle_client_link(mdr
);
1936 case CEPH_MDS_OP_UNLINK
:
1937 case CEPH_MDS_OP_RMDIR
:
1938 handle_client_unlink(mdr
);
1940 case CEPH_MDS_OP_RENAME
:
1941 handle_client_rename(mdr
);
1943 case CEPH_MDS_OP_MKDIR
:
1944 handle_client_mkdir(mdr
);
1946 case CEPH_MDS_OP_SYMLINK
:
1947 handle_client_symlink(mdr
);
1952 case CEPH_MDS_OP_LSSNAP
:
1953 handle_client_lssnap(mdr
);
1955 case CEPH_MDS_OP_MKSNAP
:
1956 handle_client_mksnap(mdr
);
1958 case CEPH_MDS_OP_RMSNAP
:
1959 handle_client_rmsnap(mdr
);
1961 case CEPH_MDS_OP_RENAMESNAP
:
1962 handle_client_renamesnap(mdr
);
1966 dout(1) << " unknown client op " << req
->get_op() << dendl
;
1967 respond_to_request(mdr
, -EOPNOTSUPP
);
1972 // ---------------------------------------
// Handles an MMDSSlaveRequest from another MDS acting as master: answers
// RENAMENOTIFY immediately, reconciles attempt numbers against any existing
// local slave MDRequest (dropping stale messages, closing out superseded
// requests, honoring OP_FINISH aborts), creates a new slave MDRequest when
// needed, waits for the right MDS state, then dispatches.
1975 /* This function DOES put the passed message before returning*/
1976 void Server::handle_slave_request(MMDSSlaveRequest
*m
)
1978 dout(4) << "handle_slave_request " << m
->get_reqid() << " from " << m
->get_source() << dendl
;
1979 mds_rank_t from
= mds_rank_t(m
->get_source().num());
1981 if (logger
) logger
->inc(l_mdss_handle_slave_request
);
// Replies (acks from our own slaves) take a separate path.
1985 return handle_slave_request_reply(m
);
1987 // the purpose of rename notify is enforcing causal message ordering. making sure
1988 // bystanders have received all messages from rename srcdn's auth MDS.
1989 if (m
->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY
) {
1990 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(m
->get_reqid(), m
->get_attempt(),
1991 MMDSSlaveRequest::OP_RENAMENOTIFYACK
);
1992 mds
->send_message(reply
, m
->get_connection());
1997 CDentry
*straydn
= NULL
;
1998 if (m
->stray
.length() > 0) {
1999 straydn
= mdcache
->add_replica_stray(m
->stray
, from
);
2004 // am i a new slave?
2006 if (mdcache
->have_request(m
->get_reqid())) {
2008 mdr
= mdcache
->request_get(m
->get_reqid());
2010 // is my request newer?
2011 if (mdr
->attempt
> m
->get_attempt()) {
2012 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " > " << m
->get_attempt()
2013 << ", dropping " << *m
<< dendl
;
2019 if (mdr
->attempt
< m
->get_attempt()) {
2020 // mine is old, close it out
2021 dout(10) << "local request " << *mdr
<< " attempt " << mdr
->attempt
<< " < " << m
->get_attempt()
2022 << ", closing out" << dendl
;
2023 mdcache
->request_finish(mdr
);
2025 } else if (mdr
->slave_to_mds
!= from
) {
2026 dout(10) << "local request " << *mdr
<< " not slave to mds." << from
<< dendl
;
// Master asked us to finish/abort: only abort work not yet prepared.
2031 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
&& m
->is_abort()) {
2032 mdr
->aborted
= true;
2033 if (mdr
->slave_request
) {
2034 // only abort on-going xlock, wrlock and auth pin
2035 assert(!mdr
->slave_did_prepare());
2037 mdcache
->request_finish(mdr
);
2045 if (m
->get_op() == MMDSSlaveRequest::OP_FINISH
) {
2046 dout(10) << "missing slave request for " << m
->get_reqid()
2047 << " OP_FINISH, must have lost race with a forward" << dendl
;
// No local request yet: register a fresh slave MDRequest for this master.
2051 mdr
= mdcache
->request_start_slave(m
->get_reqid(), m
->get_attempt(), m
);
2052 mdr
->set_op_stamp(m
->op_stamp
);
2054 assert(mdr
->slave_request
== 0); // only one at a time, please!
2058 mdr
->straydn
= straydn
;
// Gate on MDS state: defer until clientreplay/active/stopping as required.
2061 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2062 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2063 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2065 } else if (mds
->is_clientreplay() && !mds
->mdsmap
->is_clientreplay(from
) &&
2066 mdr
->locks
.empty()) {
2067 dout(3) << "not active yet, waiting" << dendl
;
2068 mds
->wait_for_active(new C_MDS_RetryMessage(mds
, m
));
2072 mdr
->reset_slave_request(m
);
2074 dispatch_slave_request(mdr
);
2077 /* This function DOES put the passed message before returning*/
2078 void Server::handle_slave_request_reply(MMDSSlaveRequest
*m
)
2080 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2082 if (!mds
->is_clientreplay() && !mds
->is_active() && !mds
->is_stopping()) {
2083 metareqid_t r
= m
->get_reqid();
2084 if (!mdcache
->have_uncommitted_master(r
, from
)) {
2085 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2086 << from
<< " reqid " << r
<< dendl
;
2090 dout(3) << "not clientreplay|active yet, waiting" << dendl
;
2091 mds
->wait_for_replay(new C_MDS_RetryMessage(mds
, m
));
2095 if (m
->get_op() == MMDSSlaveRequest::OP_COMMITTED
) {
2096 metareqid_t r
= m
->get_reqid();
2097 mdcache
->committed_master_slave(r
, from
);
2102 MDRequestRef mdr
= mdcache
->request_get(m
->get_reqid());
2103 if (m
->get_attempt() != mdr
->attempt
) {
2104 dout(10) << "handle_slave_request_reply " << *mdr
<< " ignoring reply from other attempt "
2105 << m
->get_attempt() << dendl
;
2110 switch (m
->get_op()) {
2111 case MMDSSlaveRequest::OP_XLOCKACK
:
2113 // identify lock, master request
2114 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2115 m
->get_object_info());
2116 mdr
->more()->slaves
.insert(from
);
2117 dout(10) << "got remote xlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2118 mdr
->xlocks
.insert(lock
);
2119 mdr
->locks
.insert(lock
);
2120 mdr
->finish_locking(lock
);
2121 lock
->get_xlock(mdr
, mdr
->get_client());
2123 assert(mdr
->more()->waiting_on_slave
.count(from
));
2124 mdr
->more()->waiting_on_slave
.erase(from
);
2125 assert(mdr
->more()->waiting_on_slave
.empty());
2126 mdcache
->dispatch_request(mdr
);
2130 case MMDSSlaveRequest::OP_WRLOCKACK
:
2132 // identify lock, master request
2133 SimpleLock
*lock
= mds
->locker
->get_lock(m
->get_lock_type(),
2134 m
->get_object_info());
2135 mdr
->more()->slaves
.insert(from
);
2136 dout(10) << "got remote wrlock on " << *lock
<< " on " << *lock
->get_parent() << dendl
;
2137 mdr
->remote_wrlocks
[lock
] = from
;
2138 mdr
->locks
.insert(lock
);
2139 mdr
->finish_locking(lock
);
2141 assert(mdr
->more()->waiting_on_slave
.count(from
));
2142 mdr
->more()->waiting_on_slave
.erase(from
);
2143 assert(mdr
->more()->waiting_on_slave
.empty());
2144 mdcache
->dispatch_request(mdr
);
2148 case MMDSSlaveRequest::OP_AUTHPINACK
:
2149 handle_slave_auth_pin_ack(mdr
, m
);
2152 case MMDSSlaveRequest::OP_LINKPREPACK
:
2153 handle_slave_link_prep_ack(mdr
, m
);
2156 case MMDSSlaveRequest::OP_RMDIRPREPACK
:
2157 handle_slave_rmdir_prep_ack(mdr
, m
);
2160 case MMDSSlaveRequest::OP_RENAMEPREPACK
:
2161 handle_slave_rename_prep_ack(mdr
, m
);
2164 case MMDSSlaveRequest::OP_RENAMENOTIFYACK
:
2165 handle_slave_rename_notify_ack(mdr
, m
);
2176 /* This function DOES put the mdr->slave_request before returning*/
2177 void Server::dispatch_slave_request(MDRequestRef
& mdr
)
2179 dout(7) << "dispatch_slave_request " << *mdr
<< " " << *mdr
->slave_request
<< dendl
;
2182 dout(7) << " abort flag set, finishing" << dendl
;
2183 mdcache
->request_finish(mdr
);
2187 if (logger
) logger
->inc(l_mdss_dispatch_slave_request
);
2189 int op
= mdr
->slave_request
->get_op();
2191 case MMDSSlaveRequest::OP_XLOCK
:
2192 case MMDSSlaveRequest::OP_WRLOCK
:
2195 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2196 mdr
->slave_request
->get_object_info());
2199 dout(10) << "don't have object, dropping" << dendl
;
2200 ceph_abort(); // can this happen, if we auth pinned properly.
2202 if (op
== MMDSSlaveRequest::OP_XLOCK
&& !lock
->get_parent()->is_auth()) {
2203 dout(10) << "not auth for remote xlock attempt, dropping on "
2204 << *lock
<< " on " << *lock
->get_parent() << dendl
;
2206 // use acquire_locks so that we get auth_pinning.
2207 set
<SimpleLock
*> rdlocks
;
2208 set
<SimpleLock
*> wrlocks
= mdr
->wrlocks
;
2209 set
<SimpleLock
*> xlocks
= mdr
->xlocks
;
2213 case MMDSSlaveRequest::OP_XLOCK
:
2214 xlocks
.insert(lock
);
2215 replycode
= MMDSSlaveRequest::OP_XLOCKACK
;
2217 case MMDSSlaveRequest::OP_WRLOCK
:
2218 wrlocks
.insert(lock
);
2219 replycode
= MMDSSlaveRequest::OP_WRLOCKACK
;
2223 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
2227 MMDSSlaveRequest
*r
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, replycode
);
2228 r
->set_lock_type(lock
->get_type());
2229 lock
->get_parent()->set_object_info(r
->get_object_info());
2230 mds
->send_message(r
, mdr
->slave_request
->get_connection());
2234 mdr
->reset_slave_request();
2238 case MMDSSlaveRequest::OP_UNXLOCK
:
2239 case MMDSSlaveRequest::OP_UNWRLOCK
:
2241 SimpleLock
*lock
= mds
->locker
->get_lock(mdr
->slave_request
->get_lock_type(),
2242 mdr
->slave_request
->get_object_info());
2244 bool need_issue
= false;
2246 case MMDSSlaveRequest::OP_UNXLOCK
:
2247 mds
->locker
->xlock_finish(lock
, mdr
.get(), &need_issue
);
2249 case MMDSSlaveRequest::OP_UNWRLOCK
:
2250 mds
->locker
->wrlock_finish(lock
, mdr
.get(), &need_issue
);
2254 mds
->locker
->issue_caps(static_cast<CInode
*>(lock
->get_parent()));
2256 // done. no ack necessary.
2257 mdr
->reset_slave_request();
2261 case MMDSSlaveRequest::OP_DROPLOCKS
:
2262 mds
->locker
->drop_locks(mdr
.get());
2263 mdr
->reset_slave_request();
2266 case MMDSSlaveRequest::OP_AUTHPIN
:
2267 handle_slave_auth_pin(mdr
);
2270 case MMDSSlaveRequest::OP_LINKPREP
:
2271 case MMDSSlaveRequest::OP_UNLINKPREP
:
2272 handle_slave_link_prep(mdr
);
2275 case MMDSSlaveRequest::OP_RMDIRPREP
:
2276 handle_slave_rmdir_prep(mdr
);
2279 case MMDSSlaveRequest::OP_RENAMEPREP
:
2280 handle_slave_rename_prep(mdr
);
2283 case MMDSSlaveRequest::OP_FINISH
:
2284 // information about rename imported caps
2285 if (mdr
->slave_request
->inode_export
.length() > 0)
2286 mdr
->more()->inode_import
.claim(mdr
->slave_request
->inode_export
);
2287 // finish off request.
2288 mdcache
->request_finish(mdr
);
2296 /* This function DOES put the mdr->slave_request before returning*/
2297 void Server::handle_slave_auth_pin(MDRequestRef
& mdr
)
2299 dout(10) << "handle_slave_auth_pin " << *mdr
<< dendl
;
2301 // build list of objects
2302 list
<MDSCacheObject
*> objects
;
2303 CInode
*auth_pin_freeze
= NULL
;
2304 bool fail
= false, wouldblock
= false, readonly
= false;
2306 if (mdcache
->is_readonly()) {
2307 dout(10) << " read-only FS" << dendl
;
2313 for (vector
<MDSCacheObjectInfo
>::iterator p
= mdr
->slave_request
->get_authpins().begin();
2314 p
!= mdr
->slave_request
->get_authpins().end();
2316 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2318 dout(10) << " don't have " << *p
<< dendl
;
2323 objects
.push_back(object
);
2324 if (*p
== mdr
->slave_request
->get_authpin_freeze())
2325 auth_pin_freeze
= static_cast<CInode
*>(object
);
2329 // can we auth pin them?
2331 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2334 if (!(*p
)->is_auth()) {
2335 dout(10) << " not auth for " << **p
<< dendl
;
2339 if (mdr
->is_auth_pinned(*p
))
2341 if (!mdr
->can_auth_pin(*p
)) {
2342 if (mdr
->slave_request
->is_nonblock()) {
2343 dout(10) << " can't auth_pin (freezing?) " << **p
<< " nonblocking" << dendl
;
2349 dout(10) << " waiting for authpinnable on " << **p
<< dendl
;
2350 (*p
)->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2351 mdr
->drop_local_auth_pins();
2353 mds
->locker
->notify_freeze_waiter(*p
);
2361 mdr
->drop_local_auth_pins(); // just in case
2363 /* freeze authpin wrong inode */
2364 if (mdr
->has_more() && mdr
->more()->is_freeze_authpin
&&
2365 mdr
->more()->rename_inode
!= auth_pin_freeze
)
2366 mdr
->unfreeze_auth_pin(true);
2368 /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
2369 * on the source inode to complete. This happens after all locks for the rename
2370 * operation are acquired. But to acquire locks, we need auth pin locks' parent
2371 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
2372 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
2373 * The solution is freeze the inode and prevent other MDRequests from getting new
2376 if (auth_pin_freeze
) {
2377 dout(10) << " freezing auth pin on " << *auth_pin_freeze
<< dendl
;
2378 if (!mdr
->freeze_auth_pin(auth_pin_freeze
)) {
2379 auth_pin_freeze
->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
2380 mds
->mdlog
->flush();
2384 for (list
<MDSCacheObject
*>::iterator p
= objects
.begin();
2387 dout(10) << "auth_pinning " << **p
<< dendl
;
2393 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_AUTHPINACK
);
2395 // return list of my auth_pins (if any)
2396 for (set
<MDSCacheObject
*>::iterator p
= mdr
->auth_pins
.begin();
2397 p
!= mdr
->auth_pins
.end();
2399 MDSCacheObjectInfo info
;
2400 (*p
)->set_object_info(info
);
2401 reply
->get_authpins().push_back(info
);
2402 if (*p
== (MDSCacheObject
*)auth_pin_freeze
)
2403 auth_pin_freeze
->set_object_info(reply
->get_authpin_freeze());
2407 reply
->mark_error_wouldblock();
2409 reply
->mark_error_rofs();
2411 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
2413 // clean up this request
2414 mdr
->reset_slave_request();
2418 /* This function DOES NOT put the passed ack before returning*/
2419 void Server::handle_slave_auth_pin_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
2421 dout(10) << "handle_slave_auth_pin_ack on " << *mdr
<< " " << *ack
<< dendl
;
2422 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
2425 set
<MDSCacheObject
*> pinned
;
2426 for (vector
<MDSCacheObjectInfo
>::iterator p
= ack
->get_authpins().begin();
2427 p
!= ack
->get_authpins().end();
2429 MDSCacheObject
*object
= mdcache
->get_object(*p
);
2430 assert(object
); // we pinned it
2431 dout(10) << " remote has pinned " << *object
<< dendl
;
2432 if (!mdr
->is_auth_pinned(object
))
2433 mdr
->remote_auth_pins
[object
] = from
;
2434 if (*p
== ack
->get_authpin_freeze())
2435 mdr
->set_remote_frozen_auth_pin(static_cast<CInode
*>(object
));
2436 pinned
.insert(object
);
2439 // removed frozen auth pin ?
2440 if (mdr
->more()->is_remote_frozen_authpin
&&
2441 ack
->get_authpin_freeze() == MDSCacheObjectInfo()) {
2442 auto p
= mdr
->remote_auth_pins
.find(mdr
->more()->rename_inode
);
2443 assert(p
!= mdr
->remote_auth_pins
.end());
2444 if (p
->second
== from
) {
2445 mdr
->more()->is_remote_frozen_authpin
= false;
2449 // removed auth pins?
2450 map
<MDSCacheObject
*, mds_rank_t
>::iterator p
= mdr
->remote_auth_pins
.begin();
2451 while (p
!= mdr
->remote_auth_pins
.end()) {
2452 MDSCacheObject
* object
= p
->first
;
2453 if (p
->second
== from
&& pinned
.count(object
) == 0) {
2454 dout(10) << " remote has unpinned " << *object
<< dendl
;
2455 mdr
->remote_auth_pins
.erase(p
++);
2461 if (ack
->is_error_rofs()) {
2462 mdr
->more()->slave_error
= -EROFS
;
2463 mdr
->aborted
= true;
2464 } else if (ack
->is_error_wouldblock()) {
2465 mdr
->more()->slave_error
= -EWOULDBLOCK
;
2466 mdr
->aborted
= true;
2470 mdr
->more()->slaves
.insert(from
);
2472 // clear from waiting list
2473 assert(mdr
->more()->waiting_on_slave
.count(from
));
2474 mdr
->more()->waiting_on_slave
.erase(from
);
2477 if (mdr
->more()->waiting_on_slave
.empty())
2478 mdcache
->dispatch_request(mdr
);
2480 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
// ---------------------------------------

/**
 * check whether we are permitted to complete a request
 *
 * Check whether we have permission to perform the operation specified
 * by mask on the given inode, based on the capability in the mdr's
 * session.
 */
2495 bool Server::check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
)
2498 int r
= mdr
->session
->check_access(
2500 mdr
->client_request
->get_caller_uid(),
2501 mdr
->client_request
->get_caller_gid(),
2502 &mdr
->client_request
->get_caller_gid_list(),
2503 mdr
->client_request
->head
.args
.setattr
.uid
,
2504 mdr
->client_request
->head
.args
.setattr
.gid
);
2506 respond_to_request(mdr
, r
);
/**
 * check whether fragment has reached maximum size
 */
2517 bool Server::check_fragment_space(MDRequestRef
&mdr
, CDir
*in
)
2519 const auto size
= in
->get_frag_size();
2520 if (size
>= g_conf
->mds_bal_fragment_size_max
) {
2521 dout(10) << "fragment " << *in
<< " size exceeds " << g_conf
->mds_bal_fragment_size_max
<< " (ENOSPC)" << dendl
;
2522 respond_to_request(mdr
, -ENOSPC
);
/** validate_dentry_dir
 *
 * verify that the dir exists and would own the dname.
 * do not check if the dentry exists.
 */
2535 CDir
*Server::validate_dentry_dir(MDRequestRef
& mdr
, CInode
*diri
, boost::string_view dname
)
2537 // make sure parent is a dir?
2538 if (!diri
->is_dir()) {
2539 dout(7) << "validate_dentry_dir: not a dir" << dendl
;
2540 respond_to_request(mdr
, -ENOTDIR
);
2545 frag_t fg
= diri
->pick_dirfrag(dname
);
2546 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
2551 if (dir
->is_frozen()) {
2552 dout(7) << "dir is frozen " << *dir
<< dendl
;
2553 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
/** prepare_null_dentry
 *
 * prepare a null (or existing) dentry in given dir.
 * wait for any dn lock.
 */
2565 CDentry
* Server::prepare_null_dentry(MDRequestRef
& mdr
, CDir
*dir
, boost::string_view dname
, bool okexist
)
2567 dout(10) << "prepare_null_dentry " << dname
<< " in " << *dir
<< dendl
;
2568 assert(dir
->is_auth());
2570 client_t client
= mdr
->get_client();
2572 // does it already exist?
2573 CDentry
*dn
= dir
->lookup(dname
);
2576 if (dn->lock.is_xlocked_by_other(mdr)) {
2577 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2578 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2582 if (!dn
->get_linkage(client
, mdr
)->is_null()) {
2583 // name already exists
2584 dout(10) << "dentry " << dname
<< " exists in " << *dir
<< dendl
;
2586 respond_to_request(mdr
, -EEXIST
);
2590 dn
->first
= dir
->inode
->find_snaprealm()->get_newest_seq() + 1;
2596 // make sure dir is complete
2597 if (!dir
->is_complete() && (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2598 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2599 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
2604 dn
= dir
->add_null_dentry(dname
, dir
->inode
->find_snaprealm()->get_newest_seq() + 1);
2606 dout(10) << "prepare_null_dentry added " << *dn
<< dendl
;
2610 CDentry
* Server::prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
)
2612 CDentry
*straydn
= mdr
->straydn
;
2615 in
->name_stray_dentry(straydname
);
2616 if (straydn
->get_name() == straydname
)
2619 assert(!mdr
->done_locking
);
2620 mdr
->unpin(straydn
);
2623 CDir
*straydir
= mdcache
->get_stray_dir(in
);
2625 if (!mdr
->client_request
->is_replay() &&
2626 !check_fragment_space(mdr
, straydir
))
2629 straydn
= mdcache
->get_or_create_stray_dentry(in
);
2630 mdr
->straydn
= straydn
;
/** prepare_new_inode
 *
 * create a new inode.  set c/m/atime.  hit dir pop.
 */
2639 CInode
* Server::prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
2640 file_layout_t
*layout
)
2642 CInode
*in
= new CInode(mdcache
);
2644 // Server::prepare_force_open_sessions() can re-open session in closing
2645 // state. In that corner case, session's prealloc_inos are being freed.
2646 // To simplify the code, we disallow using/refilling session's prealloc_ino
2647 // while session is opening.
2648 bool allow_prealloc_inos
= !mdr
->session
->is_opening();
2651 if (allow_prealloc_inos
&&
2652 mdr
->session
->info
.prealloc_inos
.size()) {
2653 mdr
->used_prealloc_ino
=
2654 in
->inode
.ino
= mdr
->session
->take_ino(useino
); // prealloc -> used
2655 mds
->sessionmap
.mark_projected(mdr
->session
);
2657 dout(10) << "prepare_new_inode used_prealloc " << mdr
->used_prealloc_ino
2658 << " (" << mdr
->session
->info
.prealloc_inos
2659 << ", " << mdr
->session
->info
.prealloc_inos
.size() << " left)"
2663 in
->inode
.ino
= mds
->inotable
->project_alloc_id();
2664 dout(10) << "prepare_new_inode alloc " << mdr
->alloc_ino
<< dendl
;
2667 if (useino
&& useino
!= in
->inode
.ino
) {
2668 dout(0) << "WARNING: client specified " << useino
<< " and i allocated " << in
->inode
.ino
<< dendl
;
2669 mds
->clog
->error() << mdr
->client_request
->get_source()
2670 << " specified ino " << useino
2671 << " but mds." << mds
->get_nodeid() << " allocated " << in
->inode
.ino
;
2672 //ceph_abort(); // just for now.
2675 if (allow_prealloc_inos
&&
2676 mdr
->session
->get_num_projected_prealloc_inos() < g_conf
->mds_client_prealloc_inos
/ 2) {
2677 int need
= g_conf
->mds_client_prealloc_inos
- mdr
->session
->get_num_projected_prealloc_inos();
2678 mds
->inotable
->project_alloc_ids(mdr
->prealloc_inos
, need
);
2679 assert(mdr
->prealloc_inos
.size()); // or else fix projected increment semantics
2680 mdr
->session
->pending_prealloc_inos
.insert(mdr
->prealloc_inos
);
2681 mds
->sessionmap
.mark_projected(mdr
->session
);
2682 dout(10) << "prepare_new_inode prealloc " << mdr
->prealloc_inos
<< dendl
;
2685 in
->inode
.version
= 1;
2686 in
->inode
.xattr_version
= 1;
2687 in
->inode
.nlink
= 1; // FIXME
2689 in
->inode
.mode
= mode
;
2691 memset(&in
->inode
.dir_layout
, 0, sizeof(in
->inode
.dir_layout
));
2692 if (in
->inode
.is_dir()) {
2693 in
->inode
.dir_layout
.dl_dir_hash
= g_conf
->mds_default_dir_hash
;
2694 } else if (layout
) {
2695 in
->inode
.layout
= *layout
;
2697 in
->inode
.layout
= mdcache
->default_file_layout
;
2700 in
->inode
.truncate_size
= -1ull; // not truncated, yet!
2701 in
->inode
.truncate_seq
= 1; /* starting with 1, 0 is kept for no-truncation logic */
2703 CInode
*diri
= dir
->get_inode();
2705 dout(10) << oct
<< " dir mode 0" << diri
->inode
.mode
<< " new mode 0" << mode
<< dec
<< dendl
;
2707 if (diri
->inode
.mode
& S_ISGID
) {
2708 dout(10) << " dir is sticky" << dendl
;
2709 in
->inode
.gid
= diri
->inode
.gid
;
2710 if (S_ISDIR(mode
)) {
2711 dout(10) << " new dir also sticky" << dendl
;
2712 in
->inode
.mode
|= S_ISGID
;
2715 in
->inode
.gid
= mdr
->client_request
->get_caller_gid();
2717 in
->inode
.uid
= mdr
->client_request
->get_caller_uid();
2719 in
->inode
.btime
= in
->inode
.ctime
= in
->inode
.mtime
= in
->inode
.atime
=
2720 mdr
->get_op_stamp();
2722 in
->inode
.change_attr
= 0;
2724 MClientRequest
*req
= mdr
->client_request
;
2725 if (req
->get_data().length()) {
2726 bufferlist::iterator p
= req
->get_data().begin();
2728 // xattrs on new inode?
2729 CInode::mempool_xattr_map xattrs
;
2730 ::decode(xattrs
, p
);
2731 for (const auto &p
: xattrs
) {
2732 dout(10) << "prepare_new_inode setting xattr " << p
.first
<< dendl
;
2733 auto em
= in
->xattrs
.emplace(std::piecewise_construct
, std::forward_as_tuple(p
.first
), std::forward_as_tuple(p
.second
));
2735 em
.first
->second
= p
.second
;
2739 if (!mds
->mdsmap
->get_inline_data_enabled() ||
2740 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
))
2741 in
->inode
.inline_data
.version
= CEPH_INLINE_NONE
;
2743 mdcache
->add_inode(in
); // add
2744 dout(10) << "prepare_new_inode " << *in
<< dendl
;
2748 void Server::journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
)
2750 dout(20) << "journal_allocated_inos sessionmapv " << mds
->sessionmap
.get_projected()
2751 << " inotablev " << mds
->inotable
->get_projected_version()
2753 blob
->set_ino_alloc(mdr
->alloc_ino
,
2754 mdr
->used_prealloc_ino
,
2756 mdr
->client_request
->get_source(),
2757 mds
->sessionmap
.get_projected(),
2758 mds
->inotable
->get_projected_version());
2761 void Server::apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
)
2763 dout(10) << "apply_allocated_inos " << mdr
->alloc_ino
2764 << " / " << mdr
->prealloc_inos
2765 << " / " << mdr
->used_prealloc_ino
<< dendl
;
2767 if (mdr
->alloc_ino
) {
2768 mds
->inotable
->apply_alloc_id(mdr
->alloc_ino
);
2770 if (mdr
->prealloc_inos
.size()) {
2772 session
->pending_prealloc_inos
.subtract(mdr
->prealloc_inos
);
2773 session
->info
.prealloc_inos
.insert(mdr
->prealloc_inos
);
2774 mds
->sessionmap
.mark_dirty(session
);
2775 mds
->inotable
->apply_alloc_ids(mdr
->prealloc_inos
);
2777 if (mdr
->used_prealloc_ino
) {
2779 session
->info
.used_inos
.erase(mdr
->used_prealloc_ino
);
2780 mds
->sessionmap
.mark_dirty(session
);
2784 class C_MDS_TryFindInode
: public ServerContext
{
2787 C_MDS_TryFindInode(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
2788 void finish(int r
) override
{
2789 if (r
== -ESTALE
) // :( find_ino_peers failed
2790 server
->respond_to_request(mdr
, r
);
2792 server
->dispatch_client_request(mdr
);
2796 CDir
*Server::traverse_to_auth_dir(MDRequestRef
& mdr
, vector
<CDentry
*> &trace
, filepath refpath
)
2798 // figure parent dir vs dname
2799 if (refpath
.depth() == 0) {
2800 dout(7) << "can't do that to root" << dendl
;
2801 respond_to_request(mdr
, -EINVAL
);
2804 string dname
= refpath
.last_dentry();
2805 refpath
.pop_dentry();
2807 dout(10) << "traverse_to_auth_dir dirpath " << refpath
<< " dname " << dname
<< dendl
;
2809 // traverse to parent dir
2811 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &trace
, &diri
, MDS_TRAVERSE_FORWARD
);
2812 if (r
> 0) return 0; // delayed
2815 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2816 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
2819 respond_to_request(mdr
, r
);
2823 // is it an auth dir?
2824 CDir
*dir
= validate_dentry_dir(mdr
, diri
, dname
);
2826 return 0; // forwarded or waiting for freeze
2828 dout(10) << "traverse_to_auth_dir " << *dir
<< dendl
;
2832 /* If this returns null, the request has been handled
2833 * as appropriate: forwarded on, or the client's been replied to */
2834 CInode
* Server::rdlock_path_pin_ref(MDRequestRef
& mdr
, int n
,
2835 set
<SimpleLock
*> &rdlocks
,
2837 bool no_want_auth
, /* for readdir, who doesn't want auth _even_if_ it's
2839 file_layout_t
**layout
,
2840 bool no_lookup
) // true if we cannot return a null dentry lease
2842 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2843 dout(10) << "rdlock_path_pin_ref " << *mdr
<< " " << refpath
<< dendl
;
2845 if (mdr
->done_locking
)
2849 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &mdr
->dn
[n
], &mdr
->in
[n
], MDS_TRAVERSE_FORWARD
);
2851 return NULL
; // delayed
2852 if (r
< 0) { // error
2853 if (r
== -ENOENT
&& n
== 0 && !mdr
->dn
[n
].empty()) {
2855 mdr
->tracedn
= mdr
->dn
[n
].back();
2857 respond_to_request(mdr
, r
);
2858 } else if (r
== -ESTALE
) {
2859 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
2860 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
2861 mdcache
->find_ino_peers(refpath
.get_ino(), c
);
2863 dout(10) << "FAIL on error " << r
<< dendl
;
2864 respond_to_request(mdr
, r
);
2868 CInode
*ref
= mdr
->in
[n
];
2869 dout(10) << "ref is " << *ref
<< dendl
;
2871 // fw to inode auth?
2872 if (mdr
->snapid
!= CEPH_NOSNAP
&& !no_want_auth
)
2876 if (ref
->is_ambiguous_auth()) {
2877 dout(10) << "waiting for single auth on " << *ref
<< dendl
;
2878 ref
->add_waiter(CInode::WAIT_SINGLEAUTH
, new C_MDS_RetryRequest(mdcache
, mdr
));
2881 if (!ref
->is_auth()) {
2882 dout(10) << "fw to auth for " << *ref
<< dendl
;
2883 mdcache
->request_forward(mdr
, ref
->authority().first
);
2888 // do NOT proceed if freezing, as cap release may defer in that case, and
2889 // we could deadlock when we try to lock @ref.
2890 // if we're already auth_pinned, continue; the release has already been processed.
2891 if (ref
->is_frozen() || ref
->is_frozen_auth_pin() ||
2892 (ref
->is_freezing() && !mdr
->is_auth_pinned(ref
))) {
2893 dout(7) << "waiting for !frozen/authpinnable on " << *ref
<< dendl
;
2894 ref
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
2895 /* If we have any auth pins, this will deadlock.
2896 * But the only way to get here if we've already got auth pins
2897 * is because we're on an inode with snapshots that got updated
2898 * between dispatches of this request. So we're going to drop
2899 * our locks and our auth pins and reacquire them later.
2901 * This is safe since we're only in this function when working on
2902 * a single MDS request; otherwise we'd be in
2903 * rdlock_path_xlock_dentry.
2905 mds
->locker
->drop_locks(mdr
.get(), NULL
);
2906 mdr
->drop_local_auth_pins();
2907 if (!mdr
->remote_auth_pins
.empty())
2908 mds
->locker
->notify_freeze_waiter(ref
);
2915 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
2916 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
2918 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, ref
, layout
);
2920 mds
->locker
->include_snap_rdlocks(rdlocks
, ref
);
/** rdlock_path_xlock_dentry
 *
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry, forward as necessary.
 * create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 */
2934 CDentry
* Server::rdlock_path_xlock_dentry(MDRequestRef
& mdr
, int n
,
2935 set
<SimpleLock
*>& rdlocks
, set
<SimpleLock
*>& wrlocks
, set
<SimpleLock
*>& xlocks
,
2936 bool okexist
, bool mustexist
, bool alwaysxlock
,
2937 file_layout_t
**layout
)
2939 const filepath
& refpath
= n
? mdr
->get_filepath2() : mdr
->get_filepath();
2941 dout(10) << "rdlock_path_xlock_dentry " << *mdr
<< " " << refpath
<< dendl
;
2943 client_t client
= mdr
->get_client();
2945 if (mdr
->done_locking
)
2946 return mdr
->dn
[n
].back();
2948 CDir
*dir
= traverse_to_auth_dir(mdr
, mdr
->dn
[n
], refpath
);
2951 CInode
*diri
= dir
->get_inode();
2952 if (!mdr
->reqid
.name
.is_mds()) {
2953 if (diri
->is_system() && !diri
->is_root()) {
2954 respond_to_request(mdr
, -EROFS
);
2958 if (!diri
->is_base() && diri
->get_projected_parent_dir()->inode
->is_stray()) {
2959 respond_to_request(mdr
, -ENOENT
);
2963 // make a null dentry?
2964 boost::string_view dname
= refpath
.last_dentry();
2967 dn
= dir
->lookup(dname
);
2969 // make sure dir is complete
2970 if (!dn
&& !dir
->is_complete() &&
2971 (!dir
->has_bloom() || dir
->is_in_bloom(dname
))) {
2972 dout(7) << " incomplete dir contents for " << *dir
<< ", fetching" << dendl
;
2973 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
));
2978 if (dn
&& !dn
->lock
.can_read(client
) && dn
->lock
.get_xlock_by() != mdr
) {
2979 dout(10) << "waiting on xlocked dentry " << *dn
<< dendl
;
2980 dn
->lock
.add_waiter(SimpleLock::WAIT_RD
, new C_MDS_RetryRequest(mdcache
, mdr
));
2985 if (!dn
|| dn
->get_linkage(client
, mdr
)->is_null()) {
2986 dout(7) << "dentry " << dname
<< " dne in " << *dir
<< dendl
;
2987 respond_to_request(mdr
, -ENOENT
);
2991 dn
= prepare_null_dentry(mdr
, dir
, dname
, okexist
);
2996 mdr
->dn
[n
].push_back(dn
);
2997 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
2998 mdr
->in
[n
] = dnl
->get_inode();
3001 // NOTE: rename takes the same set of locks for srcdn
3002 for (int i
=0; i
<(int)mdr
->dn
[n
].size(); i
++)
3003 rdlocks
.insert(&mdr
->dn
[n
][i
]->lock
);
3004 if (alwaysxlock
|| dnl
->is_null())
3005 xlocks
.insert(&dn
->lock
); // new dn, xlock
3007 rdlocks
.insert(&dn
->lock
); // existing dn, rdlock
3008 wrlocks
.insert(&dn
->get_dir()->inode
->filelock
); // also, wrlock on dir mtime
3009 wrlocks
.insert(&dn
->get_dir()->inode
->nestlock
); // also, wrlock on dir mtime
3011 mds
->locker
->include_snap_rdlocks_wlayout(rdlocks
, dn
->get_dir()->inode
, layout
);
3013 mds
->locker
->include_snap_rdlocks(rdlocks
, dn
->get_dir()->inode
);
/**
 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
 *
 * @param diri base inode
 * @param fg the exact frag we want
 * @param mdr request
 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
 */
3030 CDir
* Server::try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
)
3032 CDir
*dir
= diri
->get_dirfrag(fg
);
3034 // not open and inode not mine?
3035 if (!dir
&& !diri
->is_auth()) {
3036 mds_rank_t inauth
= diri
->authority().first
;
3037 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth
<< dendl
;
3038 mdcache
->request_forward(mdr
, inauth
);
3042 // not open and inode frozen?
3043 if (!dir
&& diri
->is_frozen()) {
3044 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri
<< dendl
;
3045 assert(diri
->get_parent_dir());
3046 diri
->add_waiter(CInode::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3052 dir
= diri
->get_or_open_dirfrag(mdcache
, fg
);
3054 // am i auth for the dirfrag?
3055 if (!dir
->is_auth()) {
3056 mds_rank_t auth
= dir
->authority().first
;
3057 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3058 << ", fw to mds." << auth
<< dendl
;
3059 mdcache
->request_forward(mdr
, auth
);
3067 // ===============================================================================
3070 void Server::handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
)
3072 MClientRequest
*req
= mdr
->client_request
;
3073 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3075 if (req
->get_filepath().depth() == 0 && is_lookup
) {
3076 // refpath can't be empty for lookup but it can for
3077 // getattr (we do getattr with empty refpath for mount of '/')
3078 respond_to_request(mdr
, -EINVAL
);
3082 bool want_auth
= false;
3083 int mask
= req
->head
.args
.getattr
.mask
;
3084 if (mask
& CEPH_STAT_RSTAT
)
3085 want_auth
= true; // set want_auth for CEPH_STAT_RSTAT mask
3087 CInode
*ref
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, want_auth
, false, NULL
,
3092 * if client currently holds the EXCL cap on a field, do not rdlock
3093 * it; client's stat() will result in valid info if _either_ EXCL
3094 * cap is held or MDS rdlocks and reads the value here.
3096 * handling this case here is easier than weakening rdlock
3097 * semantics... that would cause problems elsewhere.
3099 client_t client
= mdr
->get_client();
3101 Capability
*cap
= ref
->get_client_cap(client
);
3102 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
||
3103 mdr
->snapid
<= cap
->client_follows
))
3104 issued
= cap
->issued();
3106 if ((mask
& CEPH_CAP_LINK_SHARED
) && !(issued
& CEPH_CAP_LINK_EXCL
))
3107 rdlocks
.insert(&ref
->linklock
);
3108 if ((mask
& CEPH_CAP_AUTH_SHARED
) && !(issued
& CEPH_CAP_AUTH_EXCL
))
3109 rdlocks
.insert(&ref
->authlock
);
3110 if ((mask
& CEPH_CAP_XATTR_SHARED
) && !(issued
& CEPH_CAP_XATTR_EXCL
))
3111 rdlocks
.insert(&ref
->xattrlock
);
3112 if ((mask
& CEPH_CAP_FILE_SHARED
) && !(issued
& CEPH_CAP_FILE_EXCL
)) {
3113 // Don't wait on unstable filelock if client is allowed to read file size.
3114 // This can reduce the response time of getattr in the case that multiple
3115 // clients do stat(2) and there are writers.
3116 // The downside of this optimization is that mds may not issue Fs caps along
3117 // with getattr reply. Client may need to send more getattr requests.
3118 if (mdr
->rdlocks
.count(&ref
->filelock
)) {
3119 rdlocks
.insert(&ref
->filelock
);
3120 } else if (ref
->filelock
.is_stable() ||
3121 ref
->filelock
.get_num_wrlocks() > 0 ||
3122 !ref
->filelock
.can_read(mdr
->get_client())) {
3123 rdlocks
.insert(&ref
->filelock
);
3124 mdr
->done_locking
= false;
3128 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3131 if (!check_access(mdr
, ref
, MAY_READ
))
3134 utime_t now
= ceph_clock_now();
3135 mdr
->set_mds_stamp(now
);
3137 // note which caps are requested, so we return at least a snapshot
3138 // value for them. (currently this matters for xattrs and inline data)
3139 mdr
->getattr_caps
= mask
;
3141 mds
->balancer
->hit_inode(now
, ref
, META_POP_IRD
,
3142 req
->get_source().num());
3145 dout(10) << "reply to stat on " << *req
<< dendl
;
3148 mdr
->tracedn
= mdr
->dn
[0].back();
3149 respond_to_request(mdr
, 0);
3152 struct C_MDS_LookupIno2
: public ServerContext
{
3154 C_MDS_LookupIno2(Server
*s
, MDRequestRef
& r
) : ServerContext(s
), mdr(r
) {}
3155 void finish(int r
) override
{
3156 server
->_lookup_ino_2(mdr
, r
);
// Handle a client lookup-by-inode-number request (also serves
// lookup_parent / lookup_name via the want_parent / want_dentry flags).
// If the inode is not in cache, kicks off MDCache::open_ino() with a
// C_MDS_LookupIno2 continuation and waits; otherwise takes the rdlocks
// needed to return a consistent trace and replies.
// NOTE(review): this extract is lossy — returns, braces and some local
// declarations (e.g. `issued`) are omitted; treat the visible control
// flow as indicative only and consult upstream Server.cc before editing.
3160 /* This function DOES clean up the mdr before returning*/
3164 void Server::handle_client_lookup_ino(MDRequestRef
& mdr
,
3165 bool want_parent
, bool want_dentry
)
3167 MClientRequest
*req
= mdr
->client_request
;
3169 inodeno_t ino
= req
->get_filepath().get_ino();
3170 CInode
*in
= mdcache
->get_inode(ino
);
// An inode being purged is effectively gone — tell the client it is stale.
3171 if (in
&& in
->state_test(CInode::STATE_PURGING
)) {
3172 respond_to_request(mdr
, -ESTALE
);
// Not in cache: resolve the ino asynchronously, then retry via _lookup_ino_2.
3176 mdcache
->open_ino(ino
, (int64_t)-1, new C_MDS_LookupIno2(this, mdr
), false);
// Make sure the snaprealm parents are open before replying with a trace.
3180 if (mdr
&& in
->snaprealm
&& !in
->snaprealm
->is_open() &&
3181 !in
->snaprealm
->open_parents(new C_MDS_RetryRequest(mdcache
, mdr
))) {
3185 // check for nothing (not read or write); this still applies the
3187 if (!check_access(mdr
, in
, 0))
3190 CDentry
*dn
= in
->get_projected_parent_dn();
3191 CInode
*diri
= dn
? dn
->get_dir()->inode
: NULL
;
3193 set
<SimpleLock
*> rdlocks
;
3194 if (dn
&& (want_parent
|| want_dentry
)) {
3196 rdlocks
.insert(&dn
->lock
);
3199 unsigned mask
= req
->head
.args
.getattr
.mask
;
// Skip rdlocking auth/xattr state the client already holds exclusive caps on.
3201 Capability
*cap
= in
->get_client_cap(mdr
->get_client());
3203 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3204 issued
= cap
->issued();
3205 // permission bits, ACL/security xattrs
3206 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3207 rdlocks
.insert(&in
->authlock
);
3208 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3209 rdlocks
.insert(&in
->xattrlock
);
3211 mdr
->getattr_caps
= mask
;
3214 if (!rdlocks
.empty()) {
3215 set
<SimpleLock
*> wrlocks
, xlocks
;
3216 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3220 // need read access to directory inode
3221 if (!check_access(mdr
, diri
, MAY_READ
))
// want_parent/want_dentry make no sense on a base (root/mdsdir) inode.
3227 if (in
->is_base()) {
3228 respond_to_request(mdr
, -EINVAL
);
3231 if (!diri
|| diri
->is_stray()) {
3232 respond_to_request(mdr
, -ESTALE
);
3235 dout(10) << "reply to lookup_parent " << *in
<< dendl
;
3237 respond_to_request(mdr
, 0);
// want_dentry path: optionally verify the parent dir matches filepath2.
3240 inodeno_t dirino
= req
->get_filepath2().get_ino();
3241 if (!diri
|| (dirino
!= inodeno_t() && diri
->ino() != dirino
)) {
3242 respond_to_request(mdr
, -ENOENT
);
3245 dout(10) << "reply to lookup_name " << *in
<< dendl
;
3247 dout(10) << "reply to lookup_ino " << *in
<< dendl
;
3252 respond_to_request(mdr
, 0);
3256 void Server::_lookup_ino_2(MDRequestRef
& mdr
, int r
)
3258 inodeno_t ino
= mdr
->client_request
->get_filepath().get_ino();
3259 dout(10) << "_lookup_ino_2 " << mdr
.get() << " ino " << ino
<< " r=" << r
<< dendl
;
3261 // `r` is a rank if >=0, else an error code
3263 mds_rank_t
dest_rank(r
);
3264 if (dest_rank
== mds
->get_nodeid())
3265 dispatch_client_request(mdr
);
3267 mdcache
->request_forward(mdr
, dest_rank
);
3272 if (r
== -ENOENT
|| r
== -ENODATA
)
3274 respond_to_request(mdr
, r
);
// Handle a client open (no O_CREAT; see handle_client_openc for that).
// Validates flags against the inode type, enforces read-only
// constraints (read-only FS, snapshots), acquires the locks implied by
// the getattr mask and O_TRUNC, then issues caps (or snap caps) and
// journals an EOpen so the inode is pinned in the log.
// NOTE(review): extract is lossy — many returns/braces and some locals
// are omitted; consult upstream Server.cc before editing code here.
3278 /* This function takes responsibility for the passed mdr*/
3279 void Server::handle_client_open(MDRequestRef
& mdr
)
3281 MClientRequest
*req
= mdr
->client_request
;
3282 dout(7) << "open on " << req
->get_filepath() << dendl
;
3284 int flags
= req
->head
.args
.open
.flags
;
3285 int cmode
= ceph_flags_to_mode(flags
);
3287 respond_to_request(mdr
, -EINVAL
);
// Writes, O_TRUNC and O_DIRECTORY must be handled on the auth MDS.
3291 bool need_auth
= !file_mode_is_readonly(cmode
) ||
3292 (flags
& (CEPH_O_TRUNC
| CEPH_O_DIRECTORY
));
3294 if ((cmode
& CEPH_FILE_MODE_WR
) && mdcache
->is_readonly()) {
3295 dout(7) << "read-only FS" << dendl
;
3296 respond_to_request(mdr
, -EROFS
);
3300 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3301 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, need_auth
);
// Frozen / cap-exporting inodes can't issue caps; redo the traverse on auth.
3305 if (cur
->is_frozen() || cur
->state_test(CInode::STATE_EXPORTINGCAPS
)) {
3307 mdr
->done_locking
= false;
3308 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3313 if (!cur
->inode
.is_file()) {
3314 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3315 cmode
= CEPH_FILE_MODE_PIN
;
3316 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3317 if (cur
->inode
.is_symlink() && !(flags
& CEPH_O_NOFOLLOW
))
3318 flags
&= ~CEPH_O_TRUNC
;
3321 dout(10) << "open flags = " << flags
3322 << ", filemode = " << cmode
3323 << ", need_auth = " << need_auth
3327 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3328 dout(7) << "not a file or dir " << *cur << dendl;
3329 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3332 if ((flags
& CEPH_O_DIRECTORY
) && !cur
->inode
.is_dir() && !cur
->inode
.is_symlink()) {
3333 dout(7) << "specified O_DIRECTORY on non-directory " << *cur
<< dendl
;
3334 respond_to_request(mdr
, -EINVAL
);
3338 if ((flags
& CEPH_O_TRUNC
) && !cur
->inode
.is_file()) {
3339 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur
<< dendl
;
3340 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3341 respond_to_request(mdr
, cur
->inode
.is_dir() ? -EISDIR
: -EINVAL
);
// Inline-data files need client support; refuse old clients outright.
3345 if (cur
->inode
.inline_data
.version
!= CEPH_INLINE_NONE
&&
3346 !mdr
->session
->connection
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
3347 dout(7) << "old client cannot open inline data file " << *cur
<< dendl
;
3348 respond_to_request(mdr
, -EPERM
);
3352 // snapped data is read only
3353 if (mdr
->snapid
!= CEPH_NOSNAP
&&
3354 ((cmode
& CEPH_FILE_MODE_WR
) || req
->may_write())) {
3355 dout(7) << "snap " << mdr
->snapid
<< " is read-only " << *cur
<< dendl
;
3356 respond_to_request(mdr
, -EROFS
);
// As in getattr: only rdlock state the client doesn't hold exclusively.
3360 unsigned mask
= req
->head
.args
.open
.mask
;
3362 Capability
*cap
= cur
->get_client_cap(mdr
->get_client());
3364 if (cap
&& (mdr
->snapid
== CEPH_NOSNAP
|| mdr
->snapid
<= cap
->client_follows
))
3365 issued
= cap
->issued();
3366 // permission bits, ACL/security xattrs
3367 if ((mask
& CEPH_CAP_AUTH_SHARED
) && (issued
& CEPH_CAP_AUTH_EXCL
) == 0)
3368 rdlocks
.insert(&cur
->authlock
);
3369 if ((mask
& CEPH_CAP_XATTR_SHARED
) && (issued
& CEPH_CAP_XATTR_EXCL
) == 0)
3370 rdlocks
.insert(&cur
->xattrlock
);
3372 mdr
->getattr_caps
= mask
;
// O_TRUNC (not a replayed/completed request): xlock filelock and truncate.
3376 if ((flags
& CEPH_O_TRUNC
) && !mdr
->has_completed
) {
3377 assert(cur
->is_auth());
3379 xlocks
.insert(&cur
->filelock
);
3380 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3383 if (!check_access(mdr
, cur
, MAY_WRITE
))
3386 // wait for pending truncate?
3387 const auto pi
= cur
->get_projected_inode();
3388 if (pi
->is_truncating()) {
3389 dout(10) << " waiting for pending truncate from " << pi
->truncate_from
3390 << " to " << pi
->truncate_size
<< " to complete on " << *cur
<< dendl
;
// Drop our locks/pins while waiting so the pending truncate can finish.
3391 mds
->locker
->drop_locks(mdr
.get());
3392 mdr
->drop_local_auth_pins();
3393 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
3397 do_open_truncate(mdr
, cmode
);
3401 // sync filelock if snapped.
3402 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3403 // and that data itself is flushed so that we can read the snapped data off disk.
3404 if (mdr
->snapid
!= CEPH_NOSNAP
&& !cur
->is_dir()) {
3405 rdlocks
.insert(&cur
->filelock
);
3408 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3412 if (cmode
& CEPH_FILE_MODE_WR
)
3414 if (!check_access(mdr
, cur
, mask
))
3417 utime_t now
= ceph_clock_now();
3418 mdr
->set_mds_stamp(now
);
3420 if (cur
->is_file() || cur
->is_dir()) {
3421 if (mdr
->snapid
== CEPH_NOSNAP
) {
// Live inode: issue real caps for the requested open mode.
3423 Capability
*cap
= mds
->locker
->issue_new_caps(cur
, cmode
, mdr
->session
, 0, req
->is_replay());
3425 dout(12) << "open issued caps " << ccap_string(cap
->pending())
3426 << " for " << req
->get_source()
3427 << " on " << *cur
<< dendl
;
// Snapshot inode: grant immutable snap caps instead of real ones.
3429 int caps
= ceph_caps_for_mode(cmode
);
3430 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps
)
3431 << " for " << req
->get_source()
3432 << " snapid " << mdr
->snapid
3433 << " on " << *cur
<< dendl
;
3434 mdr
->snap_caps
= caps
;
3438 // increase max_size?
3439 if (cmode
& CEPH_FILE_MODE_WR
)
3440 mds
->locker
->check_inode_max_size(cur
);
3442 // make sure this inode gets into the journal
3443 if (cur
->is_auth() && cur
->last
== CEPH_NOSNAP
&&
3444 !cur
->item_open_file
.is_on_list()) {
3445 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3446 EOpen
*le
= new EOpen(mds
->mdlog
);
3447 mdlog
->start_entry(le
);
3448 le
->add_clean_inode(cur
);
3449 ls
->open_files
.push_back(&cur
->item_open_file
);
3450 mdlog
->submit_entry(le
);
// Feed the balancer: writes count as IWR, everything as IRD.
3454 if (cmode
& CEPH_FILE_MODE_WR
)
3455 mds
->balancer
->hit_inode(now
, cur
, META_POP_IWR
);
3457 mds
->balancer
->hit_inode(now
, cur
, META_POP_IRD
,
3458 mdr
->client_request
->get_source().num());
3461 if (req
->get_dentry_wanted()) {
3462 assert(mdr
->dn
[0].size());
3463 dn
= mdr
->dn
[0].back();
3468 respond_to_request(mdr
, 0);
// Journal-commit finisher for handle_client_openc(): makes the newly
// created inode/dentry durable in cache (pop projected linkage, mark
// dirty), notifies peers of the new link, bumps balancer stats, and
// replies to the client.
// NOTE(review): extract is lossy — member declarations (dn, newi,
// follows) and closing braces are omitted; see upstream Server.cc.
3471 class C_MDS_openc_finish
: public ServerLogContext
{
3476 C_MDS_openc_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
, snapid_t f
) :
3477 ServerLogContext(s
, r
), dn(d
), newi(ni
), follows(f
) {}
3478 void finish(int r
) override
{
3481 dn
->pop_projected_linkage();
3483 // dirty inode, dn, dir
3484 newi
->inode
.version
--; // a bit hacky, see C_MDS_mknod_finish
3485 newi
->mark_dirty(newi
->inode
.version
+1, mdr
->ls
);
3486 newi
->mark_dirty_parent(mdr
->ls
, true);
3490 get_mds()->locker
->share_inode_max_size(newi
);
3492 MDRequestRef null_ref
;
3493 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
3495 utime_t now
= ceph_clock_now();
3496 get_mds()->balancer
->hit_inode(now
, newi
, META_POP_IWR
);
3498 server
->respond_to_request(mdr
, 0);
// Failure-injection point for testing openc crash recovery.
3500 assert(g_conf
->mds_kill_openc_at
!= 1);
// Handle open with O_CREAT.  If the path already resolves, degrade to
// a plain open (or fail with EEXIST under O_EXCL).  Otherwise validate
// the client-requested file layout, take the dentry xlock, allocate
// and link a new inode, prime its caps/client ranges, and journal an
// EUpdate("openc") with a C_MDS_openc_finish completion.
// NOTE(review): extract is lossy — returns/braces are omitted between
// branches; consult upstream Server.cc before changing code here.
3504 /* This function takes responsibility for the passed mdr*/
3505 void Server::handle_client_openc(MDRequestRef
& mdr
)
3507 MClientRequest
*req
= mdr
->client_request
;
3508 client_t client
= mdr
->get_client();
3510 dout(7) << "open w/ O_CREAT on " << req
->get_filepath() << dendl
;
3512 int cmode
= ceph_flags_to_mode(req
->head
.args
.open
.flags
);
3514 respond_to_request(mdr
, -EINVAL
);
3518 bool excl
= req
->head
.args
.open
.flags
& CEPH_O_EXCL
;
// Probe the path first: if it fully resolves, this is just an open.
3521 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, req
->get_filepath(),
3522 &mdr
->dn
[0], NULL
, MDS_TRAVERSE_FORWARD
);
3526 handle_client_open(mdr
);
3529 if (r
< 0 && r
!= -ENOENT
) {
3531 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
3532 MDSInternalContextBase
*c
= new C_MDS_TryFindInode(this, mdr
);
3533 mdcache
->find_ino_peers(req
->get_filepath().get_ino(), c
);
3535 dout(10) << "FAIL on error " << r
<< dendl
;
3536 respond_to_request(mdr
, r
);
3542 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3543 file_layout_t
*dir_layout
= NULL
;
3544 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
,
3545 !excl
, false, false, &dir_layout
);
3547 if (mdr
->snapid
!= CEPH_NOSNAP
) {
3548 respond_to_request(mdr
, -EROFS
);
// Start from the inherited (dir or filesystem default) layout.
3552 file_layout_t layout
;
3554 layout
= *dir_layout
;
3556 layout
= mdcache
->default_file_layout
;
3558 // What kind of client caps are required to complete this operation
3559 uint64_t access
= MAY_WRITE
;
3561 const auto default_layout
= layout
;
3563 // fill in any special params from client
3564 if (req
->head
.args
.open
.stripe_unit
)
3565 layout
.stripe_unit
= req
->head
.args
.open
.stripe_unit
;
3566 if (req
->head
.args
.open
.stripe_count
)
3567 layout
.stripe_count
= req
->head
.args
.open
.stripe_count
;
3568 if (req
->head
.args
.open
.object_size
)
3569 layout
.object_size
= req
->head
.args
.open
.object_size
;
3570 if (req
->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID
) &&
3571 (__s32
)req
->head
.args
.open
.pool
>= 0) {
3572 layout
.pool_id
= req
->head
.args
.open
.pool
;
3574 // make sure we have as new a map as the client
3575 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
3576 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
3581 // If client doesn't have capability to modify layout pools, then
3582 // only permit this request if the requested pool matches what the
3583 // file would have inherited anyway from its parent.
3584 if (default_layout
!= layout
) {
3585 access
|= MAY_SET_VXATTR
;
3588 if (!layout
.is_valid()) {
3589 dout(10) << " invalid initial file layout" << dendl
;
3590 respond_to_request(mdr
, -EINVAL
);
3593 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
3594 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
3595 respond_to_request(mdr
, -EINVAL
);
3600 CDir
*dir
= dn
->get_dir();
3601 CInode
*diri
= dir
->get_inode();
3602 rdlocks
.insert(&diri
->authlock
);
3603 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3606 if (!check_access(mdr
, diri
, access
))
3609 if (!check_fragment_space(mdr
, dir
))
3612 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
// Non-null linkage here means O_EXCL raced with an existing target.
3614 if (!dnl
->is_null()) {
3616 assert(req
->head
.args
.open
.flags
& CEPH_O_EXCL
);
3617 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl
;
3618 mdr
->tracei
= dnl
->get_inode();
3620 respond_to_request(mdr
, -EEXIST
);
3625 SnapRealm
*realm
= diri
->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3626 snapid_t follows
= realm
->get_newest_seq();
3628 CInode
*in
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
3629 req
->head
.args
.open
.mode
| S_IFREG
, &layout
);
3633 dn
->push_projected_linkage(in
);
3635 in
->inode
.version
= dn
->pre_dirty();
// Remember the default pool if we created in a non-default one, so old
// backtraces can still be found.
3636 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
3637 in
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
3638 in
->inode
.update_backtrace();
// Writable open: seed the client's writable range at one layout increment.
3639 if (cmode
& CEPH_FILE_MODE_WR
) {
3640 in
->inode
.client_ranges
[client
].range
.first
= 0;
3641 in
->inode
.client_ranges
[client
].range
.last
= in
->inode
.get_layout_size_increment();
3642 in
->inode
.client_ranges
[client
].follows
= follows
;
3644 in
->inode
.rstat
.rfiles
= 1;
3646 assert(dn
->first
== follows
+1);
3647 in
->first
= dn
->first
;
3650 mdr
->ls
= mdlog
->get_current_segment();
3651 EUpdate
*le
= new EUpdate(mdlog
, "openc");
3652 mdlog
->start_entry(le
);
3653 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
3654 journal_allocated_inos(mdr
, &le
->metablob
);
3655 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
3656 le
->metablob
.add_primary_dentry(dn
, in
, true, true, true);
// Issue caps now; creator gets exclusive auth/xattr state immediately.
3659 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, req
->is_replay());
3660 in
->authlock
.set_state(LOCK_EXCL
);
3661 in
->xattrlock
.set_state(LOCK_EXCL
);
3663 // make sure this inode gets into the journal
3664 le
->metablob
.add_opened_ino(in
->ino());
3665 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
3666 ls
->open_files
.push_back(&in
->item_open_file
);
3668 C_MDS_openc_finish
*fin
= new C_MDS_openc_finish(this, mdr
, dn
, in
, follows
);
3670 if (mdr
->client_request
->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE
)) {
3671 dout(10) << "adding ino to reply to indicate inode was created" << dendl
;
3672 // add the file created flag onto the reply if create_flags features is supported
3673 ::encode(in
->inode
.ino
, mdr
->reply_extra_bl
);
3676 journal_and_reply(mdr
, in
, dn
, le
, fin
);
3678 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3679 // have overshot the split size (multiple opencs in flight), so here is
3680 // an early chance to split the dir if this openc makes it oversized.
3681 mds
->balancer
->maybe_fragment(dir
, false);
// Handle a client readdir: resolve and rdlock the directory, pick the
// correct dirfrag (adjusting if the fragtree changed since the client
// cached it), ensure the frag is complete in cache, then encode up to
// max_entries/max_bytes of dentries+inodes (with client leases) into
// the reply, setting END/COMPLETE/HASH_ORDER flags as appropriate.
// NOTE(review): extract is lossy — several locals (dirbl, dnbl,
// numfiles, keep, flags) and many returns/braces are omitted from this
// view; do not modify code here without the full upstream Server.cc.
3686 void Server::handle_client_readdir(MDRequestRef
& mdr
)
3688 MClientRequest
*req
= mdr
->client_request
;
3689 client_t client
= req
->get_source().num();
3690 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3691 CInode
*diri
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, false, true);
3694 // it's a directory, right?
3695 if (!diri
->is_dir()) {
3697 dout(10) << "reply to " << *req
<< " readdir -ENOTDIR" << dendl
;
3698 respond_to_request(mdr
, -ENOTDIR
);
3702 rdlocks
.insert(&diri
->filelock
);
3703 rdlocks
.insert(&diri
->dirfragtreelock
);
3705 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
3708 if (!check_access(mdr
, diri
, MAY_READ
))
// Resume point: either a last-seen name (path2) or a hash offset.
3712 frag_t fg
= (__u32
)req
->head
.args
.readdir
.frag
;
3713 unsigned req_flags
= (__u32
)req
->head
.args
.readdir
.flags
;
3714 string offset_str
= req
->get_path2();
3716 __u32 offset_hash
= 0;
3717 if (!offset_str
.empty())
3718 offset_hash
= ceph_frag_value(diri
->hash_dentry_name(offset_str
));
3720 offset_hash
= (__u32
)req
->head
.args
.readdir
.offset_hash
;
3722 dout(10) << " frag " << fg
<< " offset '" << offset_str
<< "'"
3723 << " offset_hash " << offset_hash
<< " flags " << req_flags
<< dendl
;
3725 // does the frag exist?
3726 if (diri
->dirfragtree
[fg
.value()] != fg
) {
3728 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3729 if (fg
.contains((unsigned)offset_hash
)) {
3730 newfg
= diri
->dirfragtree
[offset_hash
];
3732 // client actually wants next frag
3733 newfg
= diri
->dirfragtree
[fg
.value()];
3737 newfg
= diri
->dirfragtree
[fg
.value()];
3739 dout(10) << " adjust frag " << fg
<< " -> " << newfg
<< " " << diri
->dirfragtree
<< dendl
;
3743 CDir
*dir
= try_open_auth_dirfrag(diri
, fg
, mdr
);
3747 dout(10) << "handle_client_readdir on " << *dir
<< dendl
;
3748 assert(dir
->is_auth());
// The frag must be fully cached before we can enumerate it.
3750 if (!dir
->is_complete()) {
3751 if (dir
->is_frozen()) {
3752 dout(7) << "dir is frozen " << *dir
<< dendl
;
3753 mds
->locker
->drop_locks(mdr
.get());
3754 mdr
->drop_local_auth_pins();
3755 dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_MDS_RetryRequest(mdcache
, mdr
));
3759 dout(10) << " incomplete dir contents for readdir on " << *dir
<< ", fetching" << dendl
;
3760 dir
->fetch(new C_MDS_RetryRequest(mdcache
, mdr
), true);
3764 #ifdef MDS_VERIFY_FRAGSTAT
3765 dir
->verify_fragstat();
3768 utime_t now
= ceph_clock_now();
3769 mdr
->set_mds_stamp(now
);
3771 snapid_t snapid
= mdr
->snapid
;
3772 dout(10) << "snapid " << snapid
<< dendl
;
3774 SnapRealm
*realm
= diri
->find_snaprealm();
3776 unsigned max
= req
->head
.args
.readdir
.max_entries
;
3778 max
= dir
->get_num_any(); // whatever, something big.
3779 unsigned max_bytes
= req
->head
.args
.readdir
.max_bytes
;
3781 // make sure at least one item can be encoded
3782 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
3786 dir
->encode_dirstat(dirbl
, mds
->get_nodeid());
3788 // count bytes available.
3789 // this isn't perfect, but we should capture the main variable/unbounded size items!
3790 int front_bytes
= dirbl
.length() + sizeof(__u32
) + sizeof(__u8
)*2;
3791 int bytes_left
= max_bytes
- front_bytes
;
3792 bytes_left
-= realm
->get_snap_trace().length();
3794 // build dir contents
3797 bool start
= !offset_hash
&& offset_str
.empty();
3798 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3799 dentry_key_t
skip_key(snapid
, offset_str
.c_str(), offset_hash
);
3800 auto it
= start
? dir
->begin() : dir
->lower_bound(skip_key
);
3801 bool end
= (it
== dir
->end());
3802 for (; !end
&& numfiles
< max
; end
= (it
== dir
->end())) {
3803 CDentry
*dn
= it
->second
;
3806 if (dn
->state_test(CDentry::STATE_PURGING
))
3809 bool dnp
= dn
->use_projected(client
, mdr
);
3810 CDentry::linkage_t
*dnl
= dnp
? dn
->get_projected_linkage() : dn
->get_linkage();
// Skip dentries whose snap range doesn't cover the requested snapid.
3815 if (dn
->last
< snapid
|| dn
->first
> snapid
) {
3816 dout(20) << "skipping non-overlapping snap " << *dn
<< dendl
;
3821 dentry_key_t
offset_key(dn
->last
, offset_str
.c_str(), offset_hash
);
3822 if (!(offset_key
< dn
->key()))
3826 CInode
*in
= dnl
->get_inode();
3828 if (in
&& in
->ino() == CEPH_INO_CEPH
)
3832 // better for the MDS to do the work, if we think the client will stat any of these files.
3833 if (dnl
->is_remote() && !in
) {
3834 in
= mdcache
->get_inode(dnl
->get_remote_ino());
3836 dn
->link_remote(dnl
, in
);
3837 } else if (dn
->state_test(CDentry::STATE_BADREMOTEINO
)) {
3838 dout(10) << "skipping bad remote ino on " << *dn
<< dendl
;
3841 // touch everything i _do_ have
3842 for (auto &p
: *dir
) {
3843 if (!p
.second
->get_linkage()->is_null())
3844 mdcache
->lru
.lru_touch(p
.second
);
3847 // already issued caps and leases, reply immediately.
3848 if (dnbl
.length() > 0) {
3849 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDSInternalNoop
);
3850 dout(10) << " open remote dentry after caps were issued, stopping at "
3851 << dnbl
.length() << " < " << bytes_left
<< dendl
;
// Nothing encoded yet: drop locks and retry once the remote inode is open.
3855 mds
->locker
->drop_locks(mdr
.get());
3856 mdr
->drop_local_auth_pins();
3857 mdcache
->open_remote_dentry(dn
, dnp
, new C_MDS_RetryRequest(mdcache
, mdr
));
// Stop before overflowing the client's max_bytes budget.
3863 if ((int)(dnbl
.length() + dn
->get_name().length() + sizeof(__u32
) + sizeof(LeaseStat
)) > bytes_left
) {
3864 dout(10) << " ran out of room, stopping at " << dnbl
.length() << " < " << bytes_left
<< dendl
;
3868 unsigned start_len
= dnbl
.length();
3871 dout(12) << "including dn " << *dn
<< dendl
;
3872 ::encode(dn
->get_name(), dnbl
);
3873 mds
->locker
->issue_client_lease(dn
, client
, dnbl
, now
, mdr
->session
);
3876 dout(12) << "including inode " << *in
<< dendl
;
3877 int r
= in
->encode_inodestat(dnbl
, mdr
->session
, realm
, snapid
, bytes_left
- (int)dnbl
.length());
3879 // chop off dn->name, lease
3880 dout(10) << " ran out of room, stopping at " << start_len
<< " < " << bytes_left
<< dendl
;
3882 keep
.substr_of(dnbl
, 0, start_len
);
3890 mdcache
->lru
.lru_touch(dn
);
3895 flags
= CEPH_READDIR_FRAG_END
;
3897 flags
|= CEPH_READDIR_FRAG_COMPLETE
; // FIXME: what purpose does this serve
3899 // client only understand END and COMPLETE flags ?
3900 if (req_flags
& CEPH_READDIR_REPLY_BITFLAGS
) {
3901 flags
|= CEPH_READDIR_HASH_ORDER
| CEPH_READDIR_OFFSET_HASH
;
3904 // finish final blob
3905 ::encode(numfiles
, dirbl
);
3906 ::encode(flags
, dirbl
);
3907 dirbl
.claim_append(dnbl
);
3910 dout(10) << "reply to " << *req
<< " readdir num=" << numfiles
3911 << " bytes=" << dirbl
.length()
3912 << " start=" << (int)start
3913 << " end=" << (int)end
3915 mdr
->reply_extra_bl
= dirbl
;
3917 // bump popularity. NOTE: this doesn't quite capture it.
3918 mds
->balancer
->hit_dir(now
, dir
, META_POP_IRD
, -1, numfiles
);
3922 respond_to_request(mdr
, 0);
// Journal-commit finisher shared by basic inode-update operations
// (e.g. setattr): pops/dirties the projected inode, kicks off the
// actual data truncation if this update shrank the file, updates
// balancer stats, replies, and re-shares max_size with writers.
// NOTE(review): extract is lossy — member declarations (CInode *in)
// and closing braces are omitted; see upstream Server.cc.
3927 // ===============================================================================
3932 * finisher for basic inode updates
3934 class C_MDS_inode_update_finish
: public ServerLogContext
{
3936 bool truncating_smaller
, changed_ranges
;
3938 C_MDS_inode_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
,
3939 bool sm
=false, bool cr
=false) :
3940 ServerLogContext(s
, r
), in(i
), truncating_smaller(sm
), changed_ranges(cr
) { }
3941 void finish(int r
) override
{
3945 in
->pop_and_dirty_projected_inode(mdr
->ls
);
3948 // notify any clients
// Shrinking truncate: tell clients, then purge the trimmed objects.
3949 if (truncating_smaller
&& in
->inode
.is_truncating()) {
3950 get_mds()->locker
->issue_truncate(in
);
3951 get_mds()->mdcache
->truncate_inode(in
, mdr
->ls
);
3954 utime_t now
= ceph_clock_now();
3955 get_mds()->balancer
->hit_inode(now
, in
, META_POP_IWR
);
3957 server
->respond_to_request(mdr
, 0);
3960 get_mds()->locker
->share_inode_max_size(in
);
// Handle a client advisory file-lock set/unset (flock and fcntl/POSIX
// rules).  Takes an xlock on the inode's flocklock, decodes the
// requested ceph_filelock from the request head, then applies it
// against the per-inode lock state: unlocks wake waiters; lock
// attempts may succeed, fail (EWOULDBLOCK/EDEADLK), or park the
// request on the inode's WAIT_FLOCK list until the lock frees up.
// NOTE(review): extract is lossy — break/return statements between the
// switch cases and branches are omitted; see upstream Server.cc.
3964 void Server::handle_client_file_setlock(MDRequestRef
& mdr
)
3966 MClientRequest
*req
= mdr
->client_request
;
3967 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
3969 // get the inode to operate on, and set up any locks needed for that
3970 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
3974 xlocks
.insert(&cur
->flocklock
);
3975 /* acquire_locks will return true if it gets the locks. If it fails,
3976 it will redeliver this request at a later date, so drop the request.
3978 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
3979 dout(10) << "handle_client_file_setlock could not get locks!" << dendl
;
3983 // copy the lock change into a ceph_filelock so we can store/apply it
3984 ceph_filelock set_lock
;
3985 set_lock
.start
= req
->head
.args
.filelock_change
.start
;
3986 set_lock
.length
= req
->head
.args
.filelock_change
.length
;
3987 set_lock
.client
= req
->get_orig_source().num();
3988 set_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
3989 set_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
3990 set_lock
.type
= req
->head
.args
.filelock_change
.type
;
3991 bool will_wait
= req
->head
.args
.filelock_change
.wait
;
3993 dout(10) << "handle_client_file_setlock: " << set_lock
<< dendl
;
3995 ceph_lock_state_t
*lock_state
= NULL
;
3996 bool interrupt
= false;
3998 // get the appropriate lock state
// The _INTR variants appear to share the same state as their base rule
// (presumably setting `interrupt` in omitted lines — TODO confirm upstream).
3999 switch (req
->head
.args
.filelock_change
.rule
) {
4000 case CEPH_LOCK_FLOCK_INTR
:
4003 case CEPH_LOCK_FLOCK
:
4004 lock_state
= cur
->get_flock_lock_state();
4007 case CEPH_LOCK_FCNTL_INTR
:
4010 case CEPH_LOCK_FCNTL
:
4011 lock_state
= cur
->get_fcntl_lock_state();
4015 dout(10) << "got unknown lock type " << set_lock
.type
4016 << ", dropping request!" << dendl
;
4017 respond_to_request(mdr
, -EOPNOTSUPP
);
4021 dout(10) << " state prior to lock change: " << *lock_state
<< dendl
;
4022 if (CEPH_LOCK_UNLOCK
== set_lock
.type
) {
4023 list
<ceph_filelock
> activated_locks
;
4024 list
<MDSInternalContextBase
*> waiters
;
// A pending (waiting) lock being unlocked is simply cancelled.
4025 if (lock_state
->is_waiting(set_lock
)) {
4026 dout(10) << " unlock removing waiting lock " << set_lock
<< dendl
;
4027 lock_state
->remove_waiting(set_lock
);
4028 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4029 } else if (!interrupt
) {
4030 dout(10) << " unlock attempt on " << set_lock
<< dendl
;
4031 lock_state
->remove_lock(set_lock
, activated_locks
);
4032 cur
->take_waiting(CInode::WAIT_FLOCK
, waiters
);
4034 mds
->queue_waiters(waiters
);
4036 respond_to_request(mdr
, 0);
4038 dout(10) << " lock attempt on " << set_lock
<< dendl
;
4039 bool deadlock
= false;
// If we were parked waiting and the waiter record vanished, the client
// cancelled (e.g. signal) — report EINTR.
4040 if (mdr
->more()->flock_was_waiting
&&
4041 !lock_state
->is_waiting(set_lock
)) {
4042 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock
<< dendl
;
4043 respond_to_request(mdr
, -EINTR
);
4044 } else if (!lock_state
->add_lock(set_lock
, will_wait
, mdr
->more()->flock_was_waiting
, &deadlock
)) {
4045 dout(10) << " it failed on this attempt" << dendl
;
4046 // couldn't set lock right now
4048 respond_to_request(mdr
, -EDEADLK
);
4049 } else if (!will_wait
) {
4050 respond_to_request(mdr
, -EWOULDBLOCK
);
4052 dout(10) << " added to waiting list" << dendl
;
4053 assert(lock_state
->is_waiting(set_lock
));
4054 mdr
->more()->flock_was_waiting
= true;
// Release MDS locks/pins while parked so other lock traffic can proceed.
4055 mds
->locker
->drop_locks(mdr
.get());
4056 mdr
->drop_local_auth_pins();
4057 mdr
->mark_event("failed to add lock, waiting");
4059 cur
->add_waiter(CInode::WAIT_FLOCK
, new C_MDS_RetryRequest(mdcache
, mdr
));
4062 respond_to_request(mdr
, 0);
4064 dout(10) << " state after lock change: " << *lock_state
<< dendl
;
// Handle a client file-lock query (fcntl F_GETLK-style): rdlock the
// inode's flocklock, build the probe ceph_filelock from the request,
// ask the lock state what (if anything) conflicts, and return the
// (possibly updated) lock record in the reply's extra bufferlist.
// NOTE(review): extract is lossy — `break`s/returns and the `lock_bl`
// declaration are omitted; see upstream Server.cc.
4067 void Server::handle_client_file_readlock(MDRequestRef
& mdr
)
4069 MClientRequest
*req
= mdr
->client_request
;
4070 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4072 // get the inode to operate on, and set up any locks needed for that
4073 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4077 /* acquire_locks will return true if it gets the locks. If it fails,
4078 it will redeliver this request at a later date, so drop the request.
4080 rdlocks
.insert(&cur
->flocklock
);
4081 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
)) {
4082 dout(10) << "handle_client_file_readlock could not get locks!" << dendl
;
4086 // copy the lock change into a ceph_filelock so we can store/apply it
4087 ceph_filelock checking_lock
;
4088 checking_lock
.start
= req
->head
.args
.filelock_change
.start
;
4089 checking_lock
.length
= req
->head
.args
.filelock_change
.length
;
4090 checking_lock
.client
= req
->get_orig_source().num();
4091 checking_lock
.owner
= req
->head
.args
.filelock_change
.owner
;
4092 checking_lock
.pid
= req
->head
.args
.filelock_change
.pid
;
4093 checking_lock
.type
= req
->head
.args
.filelock_change
.type
;
4095 // get the appropriate lock state
4096 ceph_lock_state_t
*lock_state
= NULL
;
4097 switch (req
->head
.args
.filelock_change
.rule
) {
4098 case CEPH_LOCK_FLOCK
:
4099 lock_state
= cur
->get_flock_lock_state();
4102 case CEPH_LOCK_FCNTL
:
4103 lock_state
= cur
->get_fcntl_lock_state();
4107 dout(10) << "got unknown lock type " << checking_lock
.type
<< dendl
;
4108 respond_to_request(mdr
, -EINVAL
);
// Query only — does not modify lock state.
4111 lock_state
->look_for_lock(checking_lock
);
4114 ::encode(checking_lock
, lock_bl
);
4116 mdr
->reply_extra_bl
= lock_bl
;
4117 respond_to_request(mdr
, 0);
// Handle a client setattr: take the locks implied by the attribute
// mask (authlock for owner/mode, filelock for times/size, versionlock
// for ctime), enforce chown/chgrp permissions and full-filesystem
// ENOSPC, apply the changes to a projected inode (including a shrink
// truncate and client max_size range recalculation), and journal an
// EUpdate("setattr") completed by C_MDS_inode_update_finish.
// NOTE(review): extract is lossy — returns/braces and some locals
// (e.g. `is_full`) are omitted from this view; consult upstream
// Server.cc before editing code here.
4120 void Server::handle_client_setattr(MDRequestRef
& mdr
)
4122 MClientRequest
*req
= mdr
->client_request
;
4123 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4124 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
// Snapshots are immutable.
4127 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4128 respond_to_request(mdr
, -EROFS
);
// Refuse setattr on internal system inodes (other than base inodes).
4131 if (cur
->ino() < MDS_INO_SYSTEM_BASE
&& !cur
->is_base()) {
4132 respond_to_request(mdr
, -EPERM
);
4136 __u32 mask
= req
->head
.args
.setattr
.mask
;
4137 __u32 access_mask
= MAY_WRITE
;
// Lock selection follows which attribute families the mask touches.
4140 if (mask
& (CEPH_SETATTR_MODE
|CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_BTIME
|CEPH_SETATTR_KILL_SGUID
))
4141 xlocks
.insert(&cur
->authlock
);
4142 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
|CEPH_SETATTR_SIZE
))
4143 xlocks
.insert(&cur
->filelock
);
4144 if (mask
& CEPH_SETATTR_CTIME
)
4145 wrlocks
.insert(&cur
->versionlock
);
4147 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
// Actually changing ownership needs extra permission bits.
4150 if ((mask
& CEPH_SETATTR_UID
) && (cur
->inode
.uid
!= req
->head
.args
.setattr
.uid
))
4151 access_mask
|= MAY_CHOWN
;
4153 if ((mask
& CEPH_SETATTR_GID
) && (cur
->inode
.gid
!= req
->head
.args
.setattr
.gid
))
4154 access_mask
|= MAY_CHGRP
;
4156 if (!check_access(mdr
, cur
, access_mask
))
4159 // trunc from bigger -> smaller?
4160 auto pip
= cur
->get_projected_inode();
// old_size accounts for the client's view (setattr.old_size) as well.
4162 uint64_t old_size
= std::max
<uint64_t>(pip
->size
, req
->head
.args
.setattr
.old_size
);
4164 // ENOSPC on growing file while full, but allow shrinks
4165 if (is_full
&& req
->head
.args
.setattr
.size
> old_size
) {
4166 dout(20) << __func__
<< ": full, responding ENOSPC to setattr with larger size" << dendl
;
4167 respond_to_request(mdr
, -ENOSPC
);
4171 bool truncating_smaller
= false;
4172 if (mask
& CEPH_SETATTR_SIZE
) {
4173 truncating_smaller
= req
->head
.args
.setattr
.size
< old_size
;
// Only one truncate can be in flight per inode; park and retry.
4174 if (truncating_smaller
&& pip
->is_truncating()) {
4175 dout(10) << " waiting for pending truncate from " << pip
->truncate_from
4176 << " to " << pip
->truncate_size
<< " to complete on " << *cur
<< dendl
;
4177 mds
->locker
->drop_locks(mdr
.get());
4178 mdr
->drop_local_auth_pins();
4179 cur
->add_waiter(CInode::WAIT_TRUNC
, new C_MDS_RetryRequest(mdcache
, mdr
));
4184 bool changed_ranges
= false;
4187 mdr
->ls
= mdlog
->get_current_segment();
4188 EUpdate
*le
= new EUpdate(mdlog
, "setattr");
4189 mdlog
->start_entry(le
);
4191 auto &pi
= cur
->project_inode();
4193 if (mask
& CEPH_SETATTR_UID
)
4194 pi
.inode
.uid
= req
->head
.args
.setattr
.uid
;
4195 if (mask
& CEPH_SETATTR_GID
)
4196 pi
.inode
.gid
= req
->head
.args
.setattr
.gid
;
4198 if (mask
& CEPH_SETATTR_MODE
)
4199 pi
.inode
.mode
= (pi
.inode
.mode
& ~07777) | (req
->head
.args
.setattr
.mode
& 07777);
// chown/chgrp (or explicit KILL_SGUID) on an executable regular file
// clears setuid/setgid, matching POSIX chown semantics.
4200 else if ((mask
& (CEPH_SETATTR_UID
|CEPH_SETATTR_GID
|CEPH_SETATTR_KILL_SGUID
)) &&
4201 S_ISREG(pi
.inode
.mode
) &&
4202 (pi
.inode
.mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
4203 pi
.inode
.mode
&= ~(S_ISUID
|S_ISGID
);
4206 if (mask
& CEPH_SETATTR_MTIME
)
4207 pi
.inode
.mtime
= req
->head
.args
.setattr
.mtime
;
4208 if (mask
& CEPH_SETATTR_ATIME
)
4209 pi
.inode
.atime
= req
->head
.args
.setattr
.atime
;
4210 if (mask
& CEPH_SETATTR_BTIME
)
4211 pi
.inode
.btime
= req
->head
.args
.setattr
.btime
;
4212 if (mask
& (CEPH_SETATTR_ATIME
| CEPH_SETATTR_MTIME
| CEPH_SETATTR_BTIME
))
4213 pi
.inode
.time_warp_seq
++; // maybe not a timewarp, but still a serialization point.
4214 if (mask
& CEPH_SETATTR_SIZE
) {
4215 if (truncating_smaller
) {
4216 pi
.inode
.truncate(old_size
, req
->head
.args
.setattr
.size
);
4217 le
->metablob
.add_truncate_start(cur
->ino());
4219 pi
.inode
.size
= req
->head
.args
.setattr
.size
;
4220 pi
.inode
.rstat
.rbytes
= pi
.inode
.size
;
4222 pi
.inode
.mtime
= mdr
->get_op_stamp();
4224 // adjust client's max_size?
4225 CInode::mempool_inode::client_range_map new_ranges
;
4226 bool max_increased
= false;
4227 mds
->locker
->calc_new_client_ranges(cur
, pi
.inode
.size
, &new_ranges
, &max_increased
);
4228 if (pi
.inode
.client_ranges
!= new_ranges
) {
4229 dout(10) << " client_ranges " << pi
.inode
.client_ranges
<< " -> " << new_ranges
<< dendl
;
4230 pi
.inode
.client_ranges
= new_ranges
;
4231 changed_ranges
= true;
4235 pi
.inode
.version
= cur
->pre_dirty();
4236 pi
.inode
.ctime
= mdr
->get_op_stamp();
4237 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4238 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4239 pi
.inode
.change_attr
++;
4242 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4243 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4244 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4246 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
,
4247 truncating_smaller
, changed_ranges
));
4249 // flush immediately if there are readers/writers waiting
4250 if (xlocks
.count(&cur
->filelock
) &&
4251 (cur
->get_caps_wanted() & (CEPH_CAP_FILE_RD
|CEPH_CAP_FILE_WR
)))
4252 mds
->mdlog
->flush();
4255 /* Takes responsibility for mdr */
4256 void Server::do_open_truncate(MDRequestRef
& mdr
, int cmode
)
4258 CInode
*in
= mdr
->in
[0];
4259 client_t client
= mdr
->get_client();
4262 dout(10) << "do_open_truncate " << *in
<< dendl
;
4264 SnapRealm
*realm
= in
->find_snaprealm();
4265 mds
->locker
->issue_new_caps(in
, cmode
, mdr
->session
, realm
, mdr
->client_request
->is_replay());
4267 mdr
->ls
= mdlog
->get_current_segment();
4268 EUpdate
*le
= new EUpdate(mdlog
, "open_truncate");
4269 mdlog
->start_entry(le
);
4272 auto &pi
= in
->project_inode();
4273 pi
.inode
.version
= in
->pre_dirty();
4274 pi
.inode
.mtime
= pi
.inode
.ctime
= mdr
->get_op_stamp();
4275 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4276 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4277 pi
.inode
.change_attr
++;
4279 uint64_t old_size
= std::max
<uint64_t>(pi
.inode
.size
, mdr
->client_request
->head
.args
.open
.old_size
);
4281 pi
.inode
.truncate(old_size
, 0);
4282 le
->metablob
.add_truncate_start(in
->ino());
4285 bool changed_ranges
= false;
4286 if (cmode
& CEPH_FILE_MODE_WR
) {
4287 pi
.inode
.client_ranges
[client
].range
.first
= 0;
4288 pi
.inode
.client_ranges
[client
].range
.last
= pi
.inode
.get_layout_size_increment();
4289 pi
.inode
.client_ranges
[client
].follows
= in
->find_snaprealm()->get_newest_seq();
4290 changed_ranges
= true;
4293 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
4295 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
4296 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
4298 // make sure ino gets into the journal
4299 le
->metablob
.add_opened_ino(in
->ino());
4300 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
4301 ls
->open_files
.push_back(&in
->item_open_file
);
4303 mdr
->o_trunc
= true;
4306 if (mdr
->client_request
->get_dentry_wanted()) {
4307 assert(mdr
->dn
[0].size());
4308 dn
= mdr
->dn
[0].back();
4311 journal_and_reply(mdr
, in
, dn
, le
, new C_MDS_inode_update_finish(this, mdr
, in
, old_size
> 0,
4313 // Although the `open` part can give an early reply, the truncation won't
4314 // happen until our EUpdate is persistent, to give the client a prompt
4315 // response we must also flush that event.
4320 /* This function cleans up the passed mdr */
4321 void Server::handle_client_setlayout(MDRequestRef
& mdr
)
4323 MClientRequest
*req
= mdr
->client_request
;
4324 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4325 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4328 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4329 respond_to_request(mdr
, -EROFS
);
4332 if (!cur
->is_file()) {
4333 respond_to_request(mdr
, -EINVAL
);
4336 if (cur
->get_projected_inode()->size
||
4337 cur
->get_projected_inode()->truncate_seq
> 1) {
4338 respond_to_request(mdr
, -ENOTEMPTY
);
4343 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4344 // save existing layout for later
4345 const auto old_layout
= layout
;
4347 int access
= MAY_WRITE
;
4349 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4350 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4351 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4352 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4353 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4354 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4355 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4356 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4358 // make sure we have as new a map as the client
4359 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4360 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4365 // Don't permit layout modifications without 'p' caps
4366 if (layout
!= old_layout
) {
4367 access
|= MAY_SET_VXATTR
;
4370 if (!layout
.is_valid()) {
4371 dout(10) << "bad layout" << dendl
;
4372 respond_to_request(mdr
, -EINVAL
);
4375 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4376 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4377 respond_to_request(mdr
, -EINVAL
);
4381 xlocks
.insert(&cur
->filelock
);
4382 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4385 if (!check_access(mdr
, cur
, access
))
4389 auto &pi
= cur
->project_inode();
4390 pi
.inode
.layout
= layout
;
4391 // add the old pool to the inode
4392 pi
.inode
.add_old_pool(old_layout
.pool_id
);
4393 pi
.inode
.version
= cur
->pre_dirty();
4394 pi
.inode
.ctime
= mdr
->get_op_stamp();
4395 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4396 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4397 pi
.inode
.change_attr
++;
4400 mdr
->ls
= mdlog
->get_current_segment();
4401 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4402 mdlog
->start_entry(le
);
4403 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4404 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4405 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4407 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4410 void Server::handle_client_setdirlayout(MDRequestRef
& mdr
)
4412 MClientRequest
*req
= mdr
->client_request
;
4413 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4414 file_layout_t
*dir_layout
= NULL
;
4415 CInode
*cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4418 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4419 respond_to_request(mdr
, -EROFS
);
4423 if (!cur
->is_dir()) {
4424 respond_to_request(mdr
, -ENOTDIR
);
4428 xlocks
.insert(&cur
->policylock
);
4429 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4433 const auto old_pi
= cur
->get_projected_inode();
4434 file_layout_t layout
;
4435 if (old_pi
->has_layout())
4436 layout
= old_pi
->layout
;
4437 else if (dir_layout
)
4438 layout
= *dir_layout
;
4440 layout
= mdcache
->default_file_layout
;
4442 // Level of access required to complete
4443 int access
= MAY_WRITE
;
4445 const auto old_layout
= layout
;
4447 if (req
->head
.args
.setlayout
.layout
.fl_object_size
> 0)
4448 layout
.object_size
= req
->head
.args
.setlayout
.layout
.fl_object_size
;
4449 if (req
->head
.args
.setlayout
.layout
.fl_stripe_unit
> 0)
4450 layout
.stripe_unit
= req
->head
.args
.setlayout
.layout
.fl_stripe_unit
;
4451 if (req
->head
.args
.setlayout
.layout
.fl_stripe_count
> 0)
4452 layout
.stripe_count
=req
->head
.args
.setlayout
.layout
.fl_stripe_count
;
4453 if (req
->head
.args
.setlayout
.layout
.fl_pg_pool
> 0) {
4454 layout
.pool_id
= req
->head
.args
.setlayout
.layout
.fl_pg_pool
;
4455 // make sure we have as new a map as the client
4456 if (req
->get_mdsmap_epoch() > mds
->mdsmap
->get_epoch()) {
4457 mds
->wait_for_mdsmap(req
->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache
, mdr
));
4462 if (layout
!= old_layout
) {
4463 access
|= MAY_SET_VXATTR
;
4466 if (!layout
.is_valid()) {
4467 dout(10) << "bad layout" << dendl
;
4468 respond_to_request(mdr
, -EINVAL
);
4471 if (!mds
->mdsmap
->is_data_pool(layout
.pool_id
)) {
4472 dout(10) << " invalid data pool " << layout
.pool_id
<< dendl
;
4473 respond_to_request(mdr
, -EINVAL
);
4477 if (!check_access(mdr
, cur
, access
))
4480 auto &pi
= cur
->project_inode();
4481 pi
.inode
.layout
= layout
;
4482 pi
.inode
.version
= cur
->pre_dirty();
4485 mdr
->ls
= mdlog
->get_current_segment();
4486 EUpdate
*le
= new EUpdate(mdlog
, "setlayout");
4487 mdlog
->start_entry(le
);
4488 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4489 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4490 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4492 mdr
->no_early_reply
= true;
4493 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4498 int Server::parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
4499 file_layout_t
*layout
, bool validate
)
4501 dout(20) << "parse_layout_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4503 if (name
== "layout") {
4504 string::iterator begin
= value
.begin();
4505 string::iterator end
= value
.end();
4506 keys_and_values
<string::iterator
> p
; // create instance of parser
4507 std::map
<string
, string
> m
; // map to receive results
4508 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4511 string
left(begin
, end
);
4512 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4515 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4516 // Skip validation on each attr, we do it once at the end (avoid
4517 // rejecting intermediate states if the overall result is ok)
4518 int r
= parse_layout_vxattr(string("layout.") + q
->first
, q
->second
,
4519 osdmap
, layout
, false);
4523 } else if (name
== "layout.object_size") {
4524 layout
->object_size
= boost::lexical_cast
<unsigned>(value
);
4525 } else if (name
== "layout.stripe_unit") {
4526 layout
->stripe_unit
= boost::lexical_cast
<unsigned>(value
);
4527 } else if (name
== "layout.stripe_count") {
4528 layout
->stripe_count
= boost::lexical_cast
<unsigned>(value
);
4529 } else if (name
== "layout.pool") {
4531 layout
->pool_id
= boost::lexical_cast
<unsigned>(value
);
4532 } catch (boost::bad_lexical_cast
const&) {
4533 int64_t pool
= osdmap
.lookup_pg_pool_name(value
);
4535 dout(10) << " unknown pool " << value
<< dendl
;
4538 layout
->pool_id
= pool
;
4540 } else if (name
== "layout.pool_namespace") {
4541 layout
->pool_ns
= value
;
4543 dout(10) << " unknown layout vxattr " << name
<< dendl
;
4546 } catch (boost::bad_lexical_cast
const&) {
4547 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4551 if (validate
&& !layout
->is_valid()) {
4552 dout(10) << "bad layout" << dendl
;
4555 if (!mds
->mdsmap
->is_data_pool(layout
->pool_id
)) {
4556 dout(10) << " invalid data pool " << layout
->pool_id
<< dendl
;
4562 int Server::parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
)
4564 dout(20) << "parse_quota_vxattr name " << name
<< " value '" << value
<< "'" << dendl
;
4566 if (name
== "quota") {
4567 string::iterator begin
= value
.begin();
4568 string::iterator end
= value
.end();
4569 keys_and_values
<string::iterator
> p
; // create instance of parser
4570 std::map
<string
, string
> m
; // map to receive results
4571 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
4574 string
left(begin
, end
);
4575 dout(10) << " parsed " << m
<< " left '" << left
<< "'" << dendl
;
4578 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
4579 int r
= parse_quota_vxattr(string("quota.") + q
->first
, q
->second
, quota
);
4583 } else if (name
== "quota.max_bytes") {
4584 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4587 quota
->max_bytes
= q
;
4588 } else if (name
== "quota.max_files") {
4589 int64_t q
= boost::lexical_cast
<int64_t>(value
);
4592 quota
->max_files
= q
;
4594 dout(10) << " unknown quota vxattr " << name
<< dendl
;
4597 } catch (boost::bad_lexical_cast
const&) {
4598 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4602 if (!quota
->is_valid()) {
4603 dout(10) << "bad quota" << dendl
;
4610 * Verify that the file layout attribute carried by client
4611 * is well-formatted.
4612 * Return 0 on success, otherwise this function takes
4613 * responsibility for the passed mdr.
4615 int Server::check_layout_vxattr(MDRequestRef
& mdr
,
4618 file_layout_t
*layout
)
4620 MClientRequest
*req
= mdr
->client_request
;
4624 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4625 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4626 epoch
= osdmap
.get_epoch();
4631 // we don't have the specified pool, make sure our map
4632 // is newer than or as new as the client.
4633 epoch_t req_epoch
= req
->get_osdmap_epoch();
4635 if (req_epoch
> epoch
) {
4637 // well, our map is older. consult mds.
4638 Context
*fin
= new C_IO_Wrapper(mds
, new C_MDS_RetryRequest(mdcache
, mdr
));
4640 if (!mds
->objecter
->wait_for_map(req_epoch
, fin
))
4641 return r
; // wait, fin will retry this request later
4645 // now we have at least as new a map as the client, try again.
4646 mds
->objecter
->with_osdmap([&](const OSDMap
& osdmap
) {
4647 r
= parse_layout_vxattr(name
, value
, osdmap
, layout
);
4648 epoch
= osdmap
.get_epoch();
4651 assert(epoch
>= req_epoch
); // otherwise wait_for_map() told a lie
4653 } else if (req_epoch
== 0 && !mdr
->waited_for_osdmap
) {
4655 // For compatibility with client w/ old code, we still need get the
4656 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
4657 // we can remove those code.
4658 mdr
->waited_for_osdmap
= true;
4659 mds
->objecter
->wait_for_latest_osdmap(new C_IO_Wrapper(
4660 mds
, new C_MDS_RetryRequest(mdcache
, mdr
)));
4670 respond_to_request(mdr
, r
);
4678 void Server::handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4679 file_layout_t
*dir_layout
,
4680 set
<SimpleLock
*> rdlocks
,
4681 set
<SimpleLock
*> wrlocks
,
4682 set
<SimpleLock
*> xlocks
)
4684 MClientRequest
*req
= mdr
->client_request
;
4685 string
name(req
->get_path2());
4686 bufferlist bl
= req
->get_data();
4687 string
value (bl
.c_str(), bl
.length());
4688 dout(10) << "handle_set_vxattr " << name
4689 << " val " << value
.length()
4690 << " bytes on " << *cur
4693 CInode::mempool_inode
*pip
= nullptr;
4696 if (!check_access(mdr
, cur
, MAY_SET_VXATTR
)) {
4700 if (name
.compare(0, 15, "ceph.dir.layout") == 0) {
4701 if (!cur
->is_dir()) {
4702 respond_to_request(mdr
, -EINVAL
);
4706 file_layout_t layout
;
4707 if (cur
->get_projected_inode()->has_layout())
4708 layout
= cur
->get_projected_inode()->layout
;
4709 else if (dir_layout
)
4710 layout
= *dir_layout
;
4712 layout
= mdcache
->default_file_layout
;
4714 rest
= name
.substr(name
.find("layout"));
4715 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4718 xlocks
.insert(&cur
->policylock
);
4719 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4722 auto &pi
= cur
->project_inode();
4723 pi
.inode
.layout
= layout
;
4724 mdr
->no_early_reply
= true;
4726 } else if (name
.compare(0, 16, "ceph.file.layout") == 0) {
4727 if (!cur
->is_file()) {
4728 respond_to_request(mdr
, -EINVAL
);
4731 if (cur
->get_projected_inode()->size
||
4732 cur
->get_projected_inode()->truncate_seq
> 1) {
4733 respond_to_request(mdr
, -ENOTEMPTY
);
4736 file_layout_t layout
= cur
->get_projected_inode()->layout
;
4737 rest
= name
.substr(name
.find("layout"));
4738 if (check_layout_vxattr(mdr
, rest
, value
, &layout
) < 0)
4741 xlocks
.insert(&cur
->filelock
);
4742 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4745 auto &pi
= cur
->project_inode();
4746 int64_t old_pool
= pi
.inode
.layout
.pool_id
;
4747 pi
.inode
.add_old_pool(old_pool
);
4748 pi
.inode
.layout
= layout
;
4750 } else if (name
.compare(0, 10, "ceph.quota") == 0) {
4751 if (!cur
->is_dir() || cur
->is_root()) {
4752 respond_to_request(mdr
, -EINVAL
);
4756 quota_info_t quota
= cur
->get_projected_inode()->quota
;
4758 rest
= name
.substr(name
.find("quota"));
4759 int r
= parse_quota_vxattr(rest
, value
, "a
);
4761 respond_to_request(mdr
, r
);
4765 xlocks
.insert(&cur
->policylock
);
4766 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4769 auto &pi
= cur
->project_inode();
4770 pi
.inode
.quota
= quota
;
4772 mdr
->no_early_reply
= true;
4775 client_t exclude_ct
= mdr
->get_client();
4776 mdcache
->broadcast_quota_to_client(cur
, exclude_ct
);
4777 } else if (name
.find("ceph.dir.pin") == 0) {
4778 if (!cur
->is_dir() || cur
->is_root()) {
4779 respond_to_request(mdr
, -EINVAL
);
4785 rank
= boost::lexical_cast
<mds_rank_t
>(value
);
4786 if (rank
< 0) rank
= MDS_RANK_NONE
;
4787 } catch (boost::bad_lexical_cast
const&) {
4788 dout(10) << "bad vxattr value, unable to parse int for " << name
<< dendl
;
4789 respond_to_request(mdr
, -EINVAL
);
4793 xlocks
.insert(&cur
->policylock
);
4794 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4797 auto &pi
= cur
->project_inode();
4798 cur
->set_export_pin(rank
);
4801 dout(10) << " unknown vxattr " << name
<< dendl
;
4802 respond_to_request(mdr
, -EINVAL
);
4807 pip
->ctime
= mdr
->get_op_stamp();
4808 if (mdr
->get_op_stamp() > pip
->rstat
.rctime
)
4809 pip
->rstat
.rctime
= mdr
->get_op_stamp();
4810 pip
->version
= cur
->pre_dirty();
4812 pip
->update_backtrace();
4815 mdr
->ls
= mdlog
->get_current_segment();
4816 EUpdate
*le
= new EUpdate(mdlog
, "set vxattr layout");
4817 mdlog
->start_entry(le
);
4818 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4819 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4820 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4822 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4826 void Server::handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
4827 file_layout_t
*dir_layout
,
4828 set
<SimpleLock
*> rdlocks
,
4829 set
<SimpleLock
*> wrlocks
,
4830 set
<SimpleLock
*> xlocks
)
4832 MClientRequest
*req
= mdr
->client_request
;
4833 string
name(req
->get_path2());
4835 dout(10) << __func__
<< " " << name
<< " on " << *cur
<< dendl
;
4837 if (name
== "ceph.dir.layout") {
4838 if (!cur
->is_dir()) {
4839 respond_to_request(mdr
, -ENODATA
);
4842 if (cur
->is_root()) {
4843 dout(10) << "can't remove layout policy on the root directory" << dendl
;
4844 respond_to_request(mdr
, -EINVAL
);
4848 if (!cur
->get_projected_inode()->has_layout()) {
4849 respond_to_request(mdr
, -ENODATA
);
4853 xlocks
.insert(&cur
->policylock
);
4854 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4857 auto &pi
= cur
->project_inode();
4858 pi
.inode
.clear_layout();
4859 pi
.inode
.version
= cur
->pre_dirty();
4862 mdr
->ls
= mdlog
->get_current_segment();
4863 EUpdate
*le
= new EUpdate(mdlog
, "remove dir layout vxattr");
4864 mdlog
->start_entry(le
);
4865 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4866 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
4867 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
4869 mdr
->no_early_reply
= true;
4870 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
4872 } else if (name
== "ceph.dir.layout.pool_namespace"
4873 || name
== "ceph.file.layout.pool_namespace") {
4874 // Namespace is the only layout field that has a meaningful
4875 // null/none value (empty string, means default layout). Is equivalent
4876 // to a setxattr with empty string: pass through the empty payload of
4877 // the rmxattr request to do this.
4878 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4882 respond_to_request(mdr
, -ENODATA
);
4885 class C_MDS_inode_xattr_update_finish
: public ServerLogContext
{
4889 C_MDS_inode_xattr_update_finish(Server
*s
, MDRequestRef
& r
, CInode
*i
) :
4890 ServerLogContext(s
, r
), in(i
) { }
4891 void finish(int r
) override
{
4895 in
->pop_and_dirty_projected_inode(mdr
->ls
);
4899 utime_t now
= ceph_clock_now();
4900 get_mds()->balancer
->hit_inode(now
, in
, META_POP_IWR
);
4902 server
->respond_to_request(mdr
, 0);
4906 void Server::handle_client_setxattr(MDRequestRef
& mdr
)
4908 MClientRequest
*req
= mdr
->client_request
;
4909 string
name(req
->get_path2());
4910 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
4913 file_layout_t
*dir_layout
= NULL
;
4914 if (name
.compare(0, 15, "ceph.dir.layout") == 0)
4915 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
4917 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
4921 if (mdr
->snapid
!= CEPH_NOSNAP
) {
4922 respond_to_request(mdr
, -EROFS
);
4926 int flags
= req
->head
.args
.setxattr
.flags
;
4928 // magic ceph.* namespace?
4929 if (name
.compare(0, 5, "ceph.") == 0) {
4930 handle_set_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
4934 xlocks
.insert(&cur
->xattrlock
);
4935 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
4938 if (!check_access(mdr
, cur
, MAY_WRITE
))
4941 auto pxattrs
= cur
->get_projected_xattrs();
4942 size_t len
= req
->get_data().length();
4943 size_t inc
= len
+ name
.length();
4945 // check xattrs kv pairs size
4946 size_t cur_xattrs_size
= 0;
4947 for (const auto& p
: *pxattrs
) {
4948 if ((flags
& CEPH_XATTR_REPLACE
) && (name
.compare(std::string(boost::string_view(p
.first
))) == 0)) {
4951 cur_xattrs_size
+= p
.first
.length() + p
.second
.length();
4954 if (((cur_xattrs_size
+ inc
) > g_conf
->mds_max_xattr_pairs_size
)) {
4955 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4956 << cur_xattrs_size
<< ", inc " << inc
<< dendl
;
4957 respond_to_request(mdr
, -ENOSPC
);
4961 if ((flags
& CEPH_XATTR_CREATE
) && pxattrs
->count(mempool::mds_co::string(boost::string_view(name
)))) {
4962 dout(10) << "setxattr '" << name
<< "' XATTR_CREATE and EEXIST on " << *cur
<< dendl
;
4963 respond_to_request(mdr
, -EEXIST
);
4966 if ((flags
& CEPH_XATTR_REPLACE
) && !pxattrs
->count(mempool::mds_co::string(boost::string_view(name
)))) {
4967 dout(10) << "setxattr '" << name
<< "' XATTR_REPLACE and ENODATA on " << *cur
<< dendl
;
4968 respond_to_request(mdr
, -ENODATA
);
4972 dout(10) << "setxattr '" << name
<< "' len " << len
<< " on " << *cur
<< dendl
;
4975 auto &pi
= cur
->project_inode(true);
4976 pi
.inode
.version
= cur
->pre_dirty();
4977 pi
.inode
.ctime
= mdr
->get_op_stamp();
4978 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
4979 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
4980 pi
.inode
.change_attr
++;
4981 pi
.inode
.xattr_version
++;
4982 auto &px
= *pi
.xattrs
;
4983 if ((flags
& CEPH_XATTR_REMOVE
)) {
4984 px
.erase(mempool::mds_co::string(boost::string_view(name
)));
4986 bufferptr b
= buffer::create(len
);
4988 req
->get_data().copy(0, len
, b
.c_str());
4989 auto em
= px
.emplace(std::piecewise_construct
, std::forward_as_tuple(mempool::mds_co::string(boost::string_view(name
))), std::forward_as_tuple(b
));
4991 em
.first
->second
= b
;
4995 mdr
->ls
= mdlog
->get_current_segment();
4996 EUpdate
*le
= new EUpdate(mdlog
, "setxattr");
4997 mdlog
->start_entry(le
);
4998 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
4999 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5000 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5002 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5005 void Server::handle_client_removexattr(MDRequestRef
& mdr
)
5007 MClientRequest
*req
= mdr
->client_request
;
5008 std::string
name(req
->get_path2());
5009 std::set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5010 file_layout_t
*dir_layout
= NULL
;
5012 if (name
== "ceph.dir.layout")
5013 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true, false, &dir_layout
);
5015 cur
= rdlock_path_pin_ref(mdr
, 0, rdlocks
, true);
5019 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5020 respond_to_request(mdr
, -EROFS
);
5024 if (name
.compare(0, 5, "ceph.") == 0) {
5025 handle_remove_vxattr(mdr
, cur
, dir_layout
, rdlocks
, wrlocks
, xlocks
);
5029 xlocks
.insert(&cur
->xattrlock
);
5030 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5033 auto pxattrs
= cur
->get_projected_xattrs();
5034 if (pxattrs
->count(mempool::mds_co::string(boost::string_view(name
))) == 0) {
5035 dout(10) << "removexattr '" << name
<< "' and ENODATA on " << *cur
<< dendl
;
5036 respond_to_request(mdr
, -ENODATA
);
5040 dout(10) << "removexattr '" << name
<< "' on " << *cur
<< dendl
;
5043 auto &pi
= cur
->project_inode(true);
5044 auto &px
= *pi
.xattrs
;
5045 pi
.inode
.version
= cur
->pre_dirty();
5046 pi
.inode
.ctime
= mdr
->get_op_stamp();
5047 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5048 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5049 pi
.inode
.change_attr
++;
5050 pi
.inode
.xattr_version
++;
5051 px
.erase(mempool::mds_co::string(boost::string_view(name
)));
5054 mdr
->ls
= mdlog
->get_current_segment();
5055 EUpdate
*le
= new EUpdate(mdlog
, "removexattr");
5056 mdlog
->start_entry(le
);
5057 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5058 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, cur
, 0, PREDIRTY_PRIMARY
);
5059 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, cur
);
5061 journal_and_reply(mdr
, cur
, 0, le
, new C_MDS_inode_update_finish(this, mdr
, cur
));
5065 // =================================================================
5066 // DIRECTORY and NAMESPACE OPS
5069 // ------------------------------------------------
5073 class C_MDS_mknod_finish
: public ServerLogContext
{
5077 C_MDS_mknod_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ni
) :
5078 ServerLogContext(s
, r
), dn(d
), newi(ni
) {}
5079 void finish(int r
) override
{
5083 dn
->pop_projected_linkage();
5085 // be a bit hacky with the inode version, here.. we decrement it
5086 // just to keep mark_dirty() happen. (we didn't bother projecting
5087 // a new version of hte inode since it's just been created)
5088 newi
->inode
.version
--;
5089 newi
->mark_dirty(newi
->inode
.version
+ 1, mdr
->ls
);
5090 newi
->mark_dirty_parent(mdr
->ls
, true);
5093 if (newi
->inode
.is_dir()) {
5094 CDir
*dir
= newi
->get_dirfrag(frag_t());
5096 dir
->fnode
.version
--;
5097 dir
->mark_dirty(dir
->fnode
.version
+ 1, mdr
->ls
);
5098 dir
->mark_new(mdr
->ls
);
5103 MDRequestRef null_ref
;
5104 get_mds()->mdcache
->send_dentry_link(dn
, null_ref
);
5106 if (newi
->inode
.is_file())
5107 get_mds()->locker
->share_inode_max_size(newi
);
5110 utime_t now
= ceph_clock_now();
5111 get_mds()->balancer
->hit_inode(now
, newi
, META_POP_IWR
);
5114 server
->respond_to_request(mdr
, 0);
5119 void Server::handle_client_mknod(MDRequestRef
& mdr
)
5121 MClientRequest
*req
= mdr
->client_request
;
5122 client_t client
= mdr
->get_client();
5123 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5124 file_layout_t
*dir_layout
= NULL
;
5125 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false,
5128 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5129 respond_to_request(mdr
, -EROFS
);
5132 CInode
*diri
= dn
->get_dir()->get_inode();
5133 rdlocks
.insert(&diri
->authlock
);
5134 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5137 if (!check_access(mdr
, diri
, MAY_WRITE
))
5140 if (!check_fragment_space(mdr
, dn
->get_dir()))
5143 unsigned mode
= req
->head
.args
.mknod
.mode
;
5144 if ((mode
& S_IFMT
) == 0)
5148 file_layout_t layout
;
5149 if (dir_layout
&& S_ISREG(mode
))
5150 layout
= *dir_layout
;
5152 layout
= mdcache
->default_file_layout
;
5154 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5155 snapid_t follows
= realm
->get_newest_seq();
5156 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
),
5160 dn
->push_projected_linkage(newi
);
5162 newi
->inode
.rdev
= req
->head
.args
.mknod
.rdev
;
5163 newi
->inode
.version
= dn
->pre_dirty();
5164 newi
->inode
.rstat
.rfiles
= 1;
5165 if (layout
.pool_id
!= mdcache
->default_file_layout
.pool_id
)
5166 newi
->inode
.add_old_pool(mdcache
->default_file_layout
.pool_id
);
5167 newi
->inode
.update_backtrace();
5169 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5170 // want to write to it (e.g., if they are reexporting NFS)
5171 if (S_ISREG(newi
->inode
.mode
)) {
5172 dout(15) << " setting a client_range too, since this is a regular file" << dendl
;
5173 newi
->inode
.client_ranges
[client
].range
.first
= 0;
5174 newi
->inode
.client_ranges
[client
].range
.last
= newi
->inode
.get_layout_size_increment();
5175 newi
->inode
.client_ranges
[client
].follows
= follows
;
5177 // issue a cap on the file
5178 int cmode
= CEPH_FILE_MODE_RDWR
;
5179 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5183 // put locks in excl mode
5184 newi
->filelock
.set_state(LOCK_EXCL
);
5185 newi
->authlock
.set_state(LOCK_EXCL
);
5186 newi
->xattrlock
.set_state(LOCK_EXCL
);
5190 assert(dn
->first
== follows
+ 1);
5191 newi
->first
= dn
->first
;
5193 dout(10) << "mknod mode " << newi
->inode
.mode
<< " rdev " << newi
->inode
.rdev
<< dendl
;
5196 mdr
->ls
= mdlog
->get_current_segment();
5197 EUpdate
*le
= new EUpdate(mdlog
, "mknod");
5198 mdlog
->start_entry(le
);
5199 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5200 journal_allocated_inos(mdr
, &le
->metablob
);
5202 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(),
5203 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5204 le
->metablob
.add_primary_dentry(dn
, newi
, true, true, true);
5206 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5212 /* This function takes responsibility for the passed mdr*/
5213 void Server::handle_client_mkdir(MDRequestRef
& mdr
)
5215 MClientRequest
*req
= mdr
->client_request
;
5216 if (req
->get_filepath().is_last_dot_or_dotdot()) {
5217 respond_to_request(mdr
, -EEXIST
);
5221 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5222 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5224 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5225 respond_to_request(mdr
, -EROFS
);
5228 CDir
*dir
= dn
->get_dir();
5229 CInode
*diri
= dir
->get_inode();
5230 rdlocks
.insert(&diri
->authlock
);
5231 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5234 // mkdir check access
5235 if (!check_access(mdr
, diri
, MAY_WRITE
))
5238 if (!check_fragment_space(mdr
, dir
))
5242 SnapRealm
*realm
= dn
->get_dir()->inode
->find_snaprealm();
5243 snapid_t follows
= realm
->get_newest_seq();
5245 unsigned mode
= req
->head
.args
.mkdir
.mode
;
5248 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5251 // it's a directory.
5252 dn
->push_projected_linkage(newi
);
5254 newi
->inode
.version
= dn
->pre_dirty();
5255 newi
->inode
.rstat
.rsubdirs
= 1;
5256 newi
->inode
.update_backtrace();
5258 dout(12) << " follows " << follows
<< dendl
;
5259 assert(dn
->first
== follows
+ 1);
5260 newi
->first
= dn
->first
;
5262 // ...and that new dir is empty.
5263 CDir
*newdir
= newi
->get_or_open_dirfrag(mdcache
, frag_t());
5264 newdir
->state_set(CDir::STATE_CREATING
);
5265 newdir
->mark_complete();
5266 newdir
->fnode
.version
= newdir
->pre_dirty();
5269 mdr
->ls
= mdlog
->get_current_segment();
5270 EUpdate
*le
= new EUpdate(mdlog
, "mkdir");
5271 mdlog
->start_entry(le
);
5272 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5273 journal_allocated_inos(mdr
, &le
->metablob
);
5274 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5275 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5276 le
->metablob
.add_new_dir(newdir
); // dirty AND complete AND new
5278 // issue a cap on the directory
5279 int cmode
= CEPH_FILE_MODE_RDWR
;
5280 Capability
*cap
= mds
->locker
->issue_new_caps(newi
, cmode
, mdr
->session
, realm
, req
->is_replay());
5284 // put locks in excl mode
5285 newi
->filelock
.set_state(LOCK_EXCL
);
5286 newi
->authlock
.set_state(LOCK_EXCL
);
5287 newi
->xattrlock
.set_state(LOCK_EXCL
);
5290 // make sure this inode gets into the journal
5291 le
->metablob
.add_opened_ino(newi
->ino());
5292 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
5293 ls
->open_files
.push_back(&newi
->item_open_file
);
5295 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5301 void Server::handle_client_symlink(MDRequestRef
& mdr
)
5303 MClientRequest
*req
= mdr
->client_request
;
5304 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5305 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5307 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5308 respond_to_request(mdr
, -EROFS
);
5311 CDir
*dir
= dn
->get_dir();
5312 CInode
*diri
= dir
->get_inode();
5313 rdlocks
.insert(&diri
->authlock
);
5314 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5317 if (!check_access(mdr
, diri
, MAY_WRITE
))
5320 if (!check_fragment_space(mdr
, dir
))
5323 unsigned mode
= S_IFLNK
| 0777;
5324 CInode
*newi
= prepare_new_inode(mdr
, dn
->get_dir(), inodeno_t(req
->head
.ino
), mode
);
5328 dn
->push_projected_linkage(newi
);
5330 newi
->symlink
= mempool::mds_co::string(boost::string_view(req
->get_path2()));
5331 newi
->inode
.size
= newi
->symlink
.length();
5332 newi
->inode
.rstat
.rbytes
= newi
->inode
.size
;
5333 newi
->inode
.rstat
.rfiles
= 1;
5334 newi
->inode
.version
= dn
->pre_dirty();
5335 newi
->inode
.update_backtrace();
5337 newi
->first
= dn
->first
;
5340 mdr
->ls
= mdlog
->get_current_segment();
5341 EUpdate
*le
= new EUpdate(mdlog
, "symlink");
5342 mdlog
->start_entry(le
);
5343 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
5344 journal_allocated_inos(mdr
, &le
->metablob
);
5345 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, newi
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
5346 le
->metablob
.add_primary_dentry(dn
, newi
, true, true);
5348 journal_and_reply(mdr
, newi
, dn
, le
, new C_MDS_mknod_finish(this, mdr
, dn
, newi
));
5357 void Server::handle_client_link(MDRequestRef
& mdr
)
5359 MClientRequest
*req
= mdr
->client_request
;
5361 dout(7) << "handle_client_link " << req
->get_filepath()
5362 << " to " << req
->get_filepath2()
5365 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
5367 CDentry
*dn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, false, false, false);
5369 CInode
*targeti
= rdlock_path_pin_ref(mdr
, 1, rdlocks
, false);
5370 if (!targeti
) return;
5371 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5372 respond_to_request(mdr
, -EROFS
);
5376 CDir
*dir
= dn
->get_dir();
5377 dout(7) << "handle_client_link link " << dn
->get_name() << " in " << *dir
<< dendl
;
5378 dout(7) << "target is " << *targeti
<< dendl
;
5379 if (targeti
->is_dir()) {
5380 // if srcdn is replica, need to make sure its linkage is correct
5381 vector
<CDentry
*>& trace
= mdr
->dn
[1];
5382 if (trace
.empty() ||
5383 trace
.back()->is_auth() ||
5384 trace
.back()->lock
.can_read(mdr
->get_client())) {
5385 dout(7) << "target is a dir, failing..." << dendl
;
5386 respond_to_request(mdr
, -EINVAL
);
5391 xlocks
.insert(&targeti
->linklock
);
5393 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
5396 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
5397 if (!check_access(mdr
, targeti
, MAY_WRITE
))
5400 if (!check_access(mdr
, dir
->get_inode(), MAY_WRITE
))
5403 if (!check_fragment_space(mdr
, dir
))
5408 assert(g_conf
->mds_kill_link_at
!= 1);
5411 if (targeti
->is_auth())
5412 _link_local(mdr
, dn
, targeti
);
5414 _link_remote(mdr
, true, dn
, targeti
);
5418 class C_MDS_link_local_finish
: public ServerLogContext
{
5424 C_MDS_link_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CInode
*ti
,
5425 version_t dnpv_
, version_t tipv_
) :
5426 ServerLogContext(s
, r
), dn(d
), targeti(ti
),
5427 dnpv(dnpv_
), tipv(tipv_
) { }
5428 void finish(int r
) override
{
5430 server
->_link_local_finish(mdr
, dn
, targeti
, dnpv
, tipv
);
5435 void Server::_link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
)
5437 dout(10) << "_link_local " << *dn
<< " to " << *targeti
<< dendl
;
5439 mdr
->ls
= mdlog
->get_current_segment();
5441 // predirty NEW dentry
5442 version_t dnpv
= dn
->pre_dirty();
5443 version_t tipv
= targeti
->pre_dirty();
5445 // project inode update
5446 auto &pi
= targeti
->project_inode();
5448 pi
.inode
.ctime
= mdr
->get_op_stamp();
5449 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
5450 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
5451 pi
.inode
.change_attr
++;
5452 pi
.inode
.version
= tipv
;
5455 EUpdate
*le
= new EUpdate(mdlog
, "link_local");
5456 mdlog
->start_entry(le
);
5457 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5458 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1); // new dn
5459 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, 0, PREDIRTY_PRIMARY
); // targeti
5460 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5461 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, targeti
);
5463 // do this after predirty_*, to avoid funky extra dnl arg
5464 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5466 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_local_finish(this, mdr
, dn
, targeti
, dnpv
, tipv
));
5469 void Server::_link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
5470 version_t dnpv
, version_t tipv
)
5472 dout(10) << "_link_local_finish " << *dn
<< " to " << *targeti
<< dendl
;
5474 // link and unlock the NEW dentry
5475 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
5476 if (!dnl
->get_inode())
5477 dn
->link_remote(dnl
, targeti
);
5478 dn
->mark_dirty(dnpv
, mdr
->ls
);
5481 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5485 MDRequestRef null_ref
;
5486 mdcache
->send_dentry_link(dn
, null_ref
);
5488 // bump target popularity
5489 utime_t now
= ceph_clock_now();
5490 mds
->balancer
->hit_inode(now
, targeti
, META_POP_IWR
);
5491 mds
->balancer
->hit_dir(now
, dn
->get_dir(), META_POP_IWR
);
5494 respond_to_request(mdr
, 0);
5498 // link / unlink remote
5500 class C_MDS_link_remote_finish
: public ServerLogContext
{
5506 C_MDS_link_remote_finish(Server
*s
, MDRequestRef
& r
, bool i
, CDentry
*d
, CInode
*ti
) :
5507 ServerLogContext(s
, r
), inc(i
), dn(d
), targeti(ti
),
5508 dpv(d
->get_projected_version()) {}
5509 void finish(int r
) override
{
5511 server
->_link_remote_finish(mdr
, inc
, dn
, targeti
, dpv
);
5515 void Server::_link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
)
5517 dout(10) << "_link_remote "
5518 << (inc
? "link ":"unlink ")
5519 << *dn
<< " to " << *targeti
<< dendl
;
5521 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5522 mds_rank_t linkauth
= targeti
->authority().first
;
5523 if (mdr
->more()->witnessed
.count(linkauth
) == 0) {
5524 if (mds
->is_cluster_degraded() &&
5525 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(linkauth
)) {
5526 dout(10) << " targeti auth mds." << linkauth
<< " is not active" << dendl
;
5527 if (mdr
->more()->waiting_on_slave
.empty())
5528 mds
->wait_for_active_peer(linkauth
, new C_MDS_RetryRequest(mdcache
, mdr
));
5532 dout(10) << " targeti auth must prepare nlink++/--" << dendl
;
5535 op
= MMDSSlaveRequest::OP_LINKPREP
;
5537 op
= MMDSSlaveRequest::OP_UNLINKPREP
;
5538 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, op
);
5539 targeti
->set_object_info(req
->get_object_info());
5540 req
->op_stamp
= mdr
->get_op_stamp();
5541 mds
->send_message_mds(req
, linkauth
);
5543 assert(mdr
->more()->waiting_on_slave
.count(linkauth
) == 0);
5544 mdr
->more()->waiting_on_slave
.insert(linkauth
);
5547 dout(10) << " targeti auth has prepared nlink++/--" << dendl
;
5549 assert(g_conf
->mds_kill_link_at
!= 2);
5551 mdr
->set_mds_stamp(ceph_clock_now());
5554 mdr
->ls
= mdlog
->get_current_segment();
5555 EUpdate
*le
= new EUpdate(mdlog
, inc
? "link_remote":"unlink_remote");
5556 mdlog
->start_entry(le
);
5557 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
5558 if (!mdr
->more()->witnessed
.empty()) {
5559 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
5560 le
->reqid
= mdr
->reqid
;
5561 le
->had_slaves
= true;
5562 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
5567 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, 1);
5568 le
->metablob
.add_remote_dentry(dn
, true, targeti
->ino(), targeti
->d_type()); // new remote
5569 dn
->push_projected_linkage(targeti
->ino(), targeti
->d_type());
5572 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, targeti
, dn
->get_dir(), PREDIRTY_DIR
, -1);
5573 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
5574 le
->metablob
.add_null_dentry(dn
, true);
5575 dn
->push_projected_linkage();
5578 journal_and_reply(mdr
, targeti
, dn
, le
, new C_MDS_link_remote_finish(this, mdr
, inc
, dn
, targeti
));
5581 void Server::_link_remote_finish(MDRequestRef
& mdr
, bool inc
,
5582 CDentry
*dn
, CInode
*targeti
,
5585 dout(10) << "_link_remote_finish "
5586 << (inc
? "link ":"unlink ")
5587 << *dn
<< " to " << *targeti
<< dendl
;
5589 assert(g_conf
->mds_kill_link_at
!= 3);
5591 if (!mdr
->more()->witnessed
.empty())
5592 mdcache
->logged_master_update(mdr
->reqid
);
5595 // link the new dentry
5596 CDentry::linkage_t
*dnl
= dn
->pop_projected_linkage();
5597 if (!dnl
->get_inode())
5598 dn
->link_remote(dnl
, targeti
);
5599 dn
->mark_dirty(dpv
, mdr
->ls
);
5601 // unlink main dentry
5602 dn
->get_dir()->unlink_inode(dn
);
5603 dn
->pop_projected_linkage();
5604 dn
->mark_dirty(dn
->get_projected_version(), mdr
->ls
); // dirty old dentry
5609 MDRequestRef null_ref
;
5611 mdcache
->send_dentry_link(dn
, null_ref
);
5613 mdcache
->send_dentry_unlink(dn
, NULL
, null_ref
);
5615 // bump target popularity
5616 utime_t now
= ceph_clock_now();
5617 mds
->balancer
->hit_inode(now
, targeti
, META_POP_IWR
);
5618 mds
->balancer
->hit_dir(now
, dn
->get_dir(), META_POP_IWR
);
5621 respond_to_request(mdr
, 0);
5624 // removing a new dn?
5625 dn
->get_dir()->try_remove_unlinked_dn(dn
);
5629 // remote linking/unlinking
5631 class C_MDS_SlaveLinkPrep
: public ServerLogContext
{
5634 C_MDS_SlaveLinkPrep(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5635 ServerLogContext(s
, r
), targeti(t
) { }
5636 void finish(int r
) override
{
5638 server
->_logged_slave_link(mdr
, targeti
);
5642 class C_MDS_SlaveLinkCommit
: public ServerContext
{
5646 C_MDS_SlaveLinkCommit(Server
*s
, MDRequestRef
& r
, CInode
*t
) :
5647 ServerContext(s
), mdr(r
), targeti(t
) { }
5648 void finish(int r
) override
{
5649 server
->_commit_slave_link(mdr
, r
, targeti
);
5653 /* This function DOES put the mdr->slave_request before returning*/
5654 void Server::handle_slave_link_prep(MDRequestRef
& mdr
)
5656 dout(10) << "handle_slave_link_prep " << *mdr
5657 << " on " << mdr
->slave_request
->get_object_info()
5660 assert(g_conf
->mds_kill_link_at
!= 4);
5662 CInode
*targeti
= mdcache
->get_inode(mdr
->slave_request
->get_object_info().ino
);
5664 dout(10) << "targeti " << *targeti
<< dendl
;
5665 CDentry
*dn
= targeti
->get_parent_dn();
5666 CDentry::linkage_t
*dnl
= dn
->get_linkage();
5667 assert(dnl
->is_primary());
5669 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
5671 mdr
->auth_pin(targeti
);
5673 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5674 assert(g_conf
->mds_kill_link_at
!= 5);
5677 mdr
->ls
= mdlog
->get_current_segment();
5678 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_prep", mdr
->reqid
, mdr
->slave_to_mds
,
5679 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::LINK
);
5680 mdlog
->start_entry(le
);
5682 auto &pi
= dnl
->get_inode()->project_inode();
5684 // update journaled target inode
5686 if (mdr
->slave_request
->get_op() == MMDSSlaveRequest::OP_LINKPREP
) {
5694 link_rollback rollback
;
5695 rollback
.reqid
= mdr
->reqid
;
5696 rollback
.ino
= targeti
->ino();
5697 rollback
.old_ctime
= targeti
->inode
.ctime
; // we hold versionlock xlock; no concorrent projections
5698 const fnode_t
*pf
= targeti
->get_parent_dn()->get_dir()->get_projected_fnode();
5699 rollback
.old_dir_mtime
= pf
->fragstat
.mtime
;
5700 rollback
.old_dir_rctime
= pf
->rstat
.rctime
;
5701 rollback
.was_inc
= inc
;
5702 ::encode(rollback
, le
->rollback
);
5703 mdr
->more()->rollback_bl
= le
->rollback
;
5705 pi
.inode
.ctime
= mdr
->get_op_stamp();
5706 pi
.inode
.version
= targeti
->pre_dirty();
5708 dout(10) << " projected inode " << pi
.inode
.ino
<< " v " << pi
.inode
.version
<< dendl
;
5711 mdcache
->predirty_journal_parents(mdr
, &le
->commit
, dnl
->get_inode(), 0, PREDIRTY_SHALLOW
|PREDIRTY_PRIMARY
);
5712 mdcache
->journal_dirty_inode(mdr
.get(), &le
->commit
, targeti
);
5714 // set up commit waiter
5715 mdr
->more()->slave_commit
= new C_MDS_SlaveLinkCommit(this, mdr
, targeti
);
5717 mdr
->more()->slave_update_journaled
= true;
5718 submit_mdlog_entry(le
, new C_MDS_SlaveLinkPrep(this, mdr
, targeti
),
5723 void Server::_logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
)
5725 dout(10) << "_logged_slave_link " << *mdr
5726 << " " << *targeti
<< dendl
;
5728 assert(g_conf
->mds_kill_link_at
!= 6);
5730 // update the target
5731 targeti
->pop_and_dirty_projected_inode(mdr
->ls
);
5735 utime_t now
= ceph_clock_now();
5736 mds
->balancer
->hit_inode(now
, targeti
, META_POP_IWR
);
5739 mdr
->reset_slave_request();
5742 if (!mdr
->aborted
) {
5743 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5744 MMDSSlaveRequest::OP_LINKPREPACK
);
5745 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
5747 dout(10) << " abort flag set, finishing" << dendl
;
5748 mdcache
->request_finish(mdr
);
5753 struct C_MDS_CommittedSlave
: public ServerLogContext
{
5754 C_MDS_CommittedSlave(Server
*s
, MDRequestRef
& m
) : ServerLogContext(s
, m
) {}
5755 void finish(int r
) override
{
5756 server
->_committed_slave(mdr
);
5760 void Server::_commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
)
5762 dout(10) << "_commit_slave_link " << *mdr
5764 << " " << *targeti
<< dendl
;
5766 assert(g_conf
->mds_kill_link_at
!= 7);
5769 // drop our pins, etc.
5772 // write a commit to the journal
5773 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_commit", mdr
->reqid
, mdr
->slave_to_mds
,
5774 ESlaveUpdate::OP_COMMIT
, ESlaveUpdate::LINK
);
5775 mdlog
->start_entry(le
);
5776 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
5779 do_link_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
5783 void Server::_committed_slave(MDRequestRef
& mdr
)
5785 dout(10) << "_committed_slave " << *mdr
<< dendl
;
5787 assert(g_conf
->mds_kill_link_at
!= 8);
5789 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
5790 MMDSSlaveRequest::OP_COMMITTED
);
5791 mds
->send_message_mds(req
, mdr
->slave_to_mds
);
5792 mdcache
->request_finish(mdr
);
5795 struct C_MDS_LoggedLinkRollback
: public ServerLogContext
{
5797 C_MDS_LoggedLinkRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
) : ServerLogContext(s
, r
), mut(m
) {}
5798 void finish(int r
) override
{
5799 server
->_link_rollback_finish(mut
, mdr
);
5803 void Server::do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
5805 link_rollback rollback
;
5806 bufferlist::iterator p
= rbl
.begin();
5807 ::decode(rollback
, p
);
5809 dout(10) << "do_link_rollback on " << rollback
.reqid
5810 << (rollback
.was_inc
? " inc":" dec")
5811 << " ino " << rollback
.ino
5814 assert(g_conf
->mds_kill_link_at
!= 9);
5816 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
5817 assert(mdr
|| mds
->is_resolve());
5819 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
5820 mut
->ls
= mds
->mdlog
->get_current_segment();
5822 CInode
*in
= mdcache
->get_inode(rollback
.ino
);
5824 dout(10) << " target is " << *in
<< dendl
;
5825 assert(!in
->is_projected()); // live slave request hold versionlock xlock.
5827 auto &pi
= in
->project_inode();
5828 pi
.inode
.version
= in
->pre_dirty();
5829 mut
->add_projected_inode(in
);
5831 // parent dir rctime
5832 CDir
*parent
= in
->get_projected_parent_dn()->get_dir();
5833 fnode_t
*pf
= parent
->project_fnode();
5834 mut
->add_projected_fnode(parent
);
5835 pf
->version
= parent
->pre_dirty();
5836 if (pf
->fragstat
.mtime
== pi
.inode
.ctime
) {
5837 pf
->fragstat
.mtime
= rollback
.old_dir_mtime
;
5838 if (pf
->rstat
.rctime
== pi
.inode
.ctime
)
5839 pf
->rstat
.rctime
= rollback
.old_dir_rctime
;
5840 mut
->add_updated_lock(&parent
->get_inode()->filelock
);
5841 mut
->add_updated_lock(&parent
->get_inode()->nestlock
);
5845 pi
.inode
.ctime
= rollback
.old_ctime
;
5846 if (rollback
.was_inc
)
5852 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_link_rollback", rollback
.reqid
, master
,
5853 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::LINK
);
5854 mdlog
->start_entry(le
);
5855 le
->commit
.add_dir_context(parent
);
5856 le
->commit
.add_dir(parent
, true);
5857 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), 0, true);
5859 submit_mdlog_entry(le
, new C_MDS_LoggedLinkRollback(this, mut
, mdr
),
5864 void Server::_link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
)
5866 dout(10) << "_link_rollback_finish" << dendl
;
5868 assert(g_conf
->mds_kill_link_at
!= 10);
5872 mdcache
->request_finish(mdr
);
5874 mdcache
->finish_rollback(mut
->reqid
);
5880 /* This function DOES NOT put the passed message before returning*/
5881 void Server::handle_slave_link_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*m
)
5883 dout(10) << "handle_slave_link_prep_ack " << *mdr
5884 << " " << *m
<< dendl
;
5885 mds_rank_t from
= mds_rank_t(m
->get_source().num());
5887 assert(g_conf
->mds_kill_link_at
!= 11);
5890 mdr
->more()->slaves
.insert(from
);
5893 assert(mdr
->more()->witnessed
.count(from
) == 0);
5894 mdr
->more()->witnessed
.insert(from
);
5895 assert(!m
->is_not_journaled());
5896 mdr
->more()->has_journaled_slaves
= true;
5898 // remove from waiting list
5899 assert(mdr
->more()->waiting_on_slave
.count(from
));
5900 mdr
->more()->waiting_on_slave
.erase(from
);
5902 assert(mdr
->more()->waiting_on_slave
.empty());
5904 dispatch_client_request(mdr
); // go again!
5913 void Server::handle_client_unlink(MDRequestRef
& mdr
)
5915 MClientRequest
*req
= mdr
->client_request
;
5916 client_t client
= mdr
->get_client();
5920 if (req
->get_op() == CEPH_MDS_OP_RMDIR
) rmdir
= true;
5922 const filepath
& refpath
= req
->get_filepath();
5923 if (refpath
.depth() == 0) {
5924 respond_to_request(mdr
, -EINVAL
);
5927 if (refpath
.is_last_dot_or_dotdot()) {
5928 respond_to_request(mdr
, -ENOTEMPTY
);
5933 vector
<CDentry
*> trace
;
5935 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, refpath
, &trace
, &in
, MDS_TRAVERSE_FORWARD
);
5939 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
5940 mdcache
->find_ino_peers(refpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
5943 respond_to_request(mdr
, r
);
5946 if (mdr
->snapid
!= CEPH_NOSNAP
) {
5947 respond_to_request(mdr
, -EROFS
);
5951 CDentry
*dn
= trace
.back();
5953 if (!dn
->is_auth()) {
5954 mdcache
->request_forward(mdr
, dn
->authority().first
);
5958 CInode
*diri
= dn
->get_dir()->get_inode();
5960 CDentry::linkage_t
*dnl
= dn
->get_linkage(client
, mdr
);
5961 assert(!dnl
->is_null());
5964 dout(7) << "handle_client_rmdir on " << *dn
<< dendl
;
5966 dout(7) << "handle_client_unlink on " << *dn
<< dendl
;
5968 dout(7) << "dn links to " << *in
<< dendl
;
5973 // do empty directory checks
5974 if (_dir_is_nonempty_unlocked(mdr
, in
)) {
5975 respond_to_request(mdr
, -ENOTEMPTY
);
5979 dout(7) << "handle_client_unlink on dir " << *in
<< ", returning error" << dendl
;
5980 respond_to_request(mdr
, -EISDIR
);
5986 dout(7) << "handle_client_rmdir on non-dir " << *in
<< ", returning error" << dendl
;
5987 respond_to_request(mdr
, -ENOTDIR
);
5992 // -- create stray dentry? --
5993 CDentry
*straydn
= NULL
;
5994 if (dnl
->is_primary()) {
5995 straydn
= prepare_stray_dentry(mdr
, dnl
->get_inode());
5998 dout(10) << " straydn is " << *straydn
<< dendl
;
5999 } else if (mdr
->straydn
) {
6000 mdr
->unpin(mdr
->straydn
);
6001 mdr
->straydn
= NULL
;
6005 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
6006 for (int i
=0; i
<(int)trace
.size()-1; i
++) {
6007 rdlocks
.insert(&trace
[i
]->lock
);
6009 xlocks
.insert(&dn
->lock
);
6010 wrlocks
.insert(&diri
->filelock
);
6011 wrlocks
.insert(&diri
->nestlock
);
6012 xlocks
.insert(&in
->linklock
);
6014 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
6015 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
6016 xlocks
.insert(&straydn
->lock
);
6019 rdlocks
.insert(&in
->filelock
); // to verify it's empty
6020 mds
->locker
->include_snap_rdlocks(rdlocks
, dnl
->get_inode());
6022 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
6026 _dir_is_nonempty(mdr
, in
)) {
6027 respond_to_request(mdr
, -ENOTEMPTY
);
6031 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6032 if (!check_access(mdr
, diri
, MAY_WRITE
))
6037 if (in
->is_dir() && in
->has_subtree_root_dirfrag()) {
6038 // subtree root auths need to be witnesses
6039 set
<mds_rank_t
> witnesses
;
6040 in
->list_replicas(witnesses
);
6041 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
6043 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
6044 p
!= witnesses
.end();
6046 if (mdr
->more()->witnessed
.count(*p
)) {
6047 dout(10) << " already witnessed by mds." << *p
<< dendl
;
6048 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
6049 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
6051 if (!_rmdir_prepare_witness(mdr
, *p
, trace
, straydn
))
6055 if (!mdr
->more()->waiting_on_slave
.empty())
6056 return; // we're waiting for a witness.
6060 if (dnl
->is_remote() && !dnl
->get_inode()->is_auth())
6061 _link_remote(mdr
, false, dn
, dnl
->get_inode());
6063 _unlink_local(mdr
, dn
, straydn
);
6066 class C_MDS_unlink_local_finish
: public ServerLogContext
{
6069 version_t dnpv
; // deleted dentry
6071 C_MDS_unlink_local_finish(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*sd
) :
6072 ServerLogContext(s
, r
), dn(d
), straydn(sd
),
6073 dnpv(d
->get_projected_version()) {}
6074 void finish(int r
) override
{
6076 server
->_unlink_local_finish(mdr
, dn
, straydn
, dnpv
);
6080 void Server::_unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6082 dout(10) << "_unlink_local " << *dn
<< dendl
;
6084 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
6085 CInode
*in
= dnl
->get_inode();
6087 SnapRealm
*realm
= in
->find_snaprealm();
6088 snapid_t follows
= realm
->get_newest_seq();
6091 mdr
->ls
= mdlog
->get_current_segment();
6093 // prepare log entry
6094 EUpdate
*le
= new EUpdate(mdlog
, "unlink_local");
6095 mdlog
->start_entry(le
);
6096 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
6097 if (!mdr
->more()->witnessed
.empty()) {
6098 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
6099 le
->reqid
= mdr
->reqid
;
6100 le
->had_slaves
= true;
6101 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
6105 assert(dnl
->is_primary());
6106 straydn
->push_projected_linkage(in
);
6107 straydn
->first
= follows
+ 1;
6110 // the unlinked dentry
6113 auto &pi
= in
->project_inode();
6116 dn
->make_path_string(t
, true);
6117 pi
.inode
.stray_prior_path
= mempool::mds_co::string(boost::string_view(t
));
6119 mdr
->add_projected_inode(in
); // do this _after_ my dn->pre_dirty().. we apply that one manually.
6120 pi
.inode
.version
= in
->pre_dirty();
6121 pi
.inode
.ctime
= mdr
->get_op_stamp();
6122 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
6123 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
6124 pi
.inode
.change_attr
++;
6126 if (pi
.inode
.nlink
== 0)
6127 in
->state_set(CInode::STATE_ORPHAN
);
6129 if (dnl
->is_primary()) {
6130 // primary link. add stray dentry.
6132 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, -1);
6133 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, straydn
->get_dir(), PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
6135 // project snaprealm, too
6136 if (in
->snaprealm
|| follows
+ 1 > in
->get_oldest_snap())
6137 in
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
6139 pi
.inode
.update_backtrace();
6140 le
->metablob
.add_primary_dentry(straydn
, in
, true, true);
6142 // remote link. update remote inode.
6143 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, dn
->get_dir(), PREDIRTY_DIR
, -1);
6144 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, in
, 0, PREDIRTY_PRIMARY
);
6145 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, in
);
6148 mdcache
->journal_cow_dentry(mdr
.get(), &le
->metablob
, dn
);
6149 le
->metablob
.add_null_dentry(dn
, true);
6152 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6153 le
->metablob
.renamed_dirino
= in
->ino();
6156 dn
->push_projected_linkage();
6160 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6162 in
->maybe_export_pin(true);
6165 journal_and_reply(mdr
, 0, dn
, le
, new C_MDS_unlink_local_finish(this, mdr
, dn
, straydn
));
6168 void Server::_unlink_local_finish(MDRequestRef
& mdr
,
6169 CDentry
*dn
, CDentry
*straydn
,
6172 dout(10) << "_unlink_local_finish " << *dn
<< dendl
;
6174 if (!mdr
->more()->witnessed
.empty())
6175 mdcache
->logged_master_update(mdr
->reqid
);
6177 // unlink main dentry
6178 dn
->get_dir()->unlink_inode(dn
);
6179 dn
->pop_projected_linkage();
6181 // relink as stray? (i.e. was primary link?)
6182 CInode
*strayin
= NULL
;
6183 bool snap_is_new
= false;
6185 dout(20) << " straydn is " << *straydn
<< dendl
;
6186 CDentry::linkage_t
*straydnl
= straydn
->pop_projected_linkage();
6187 strayin
= straydnl
->get_inode();
6189 snap_is_new
= strayin
->snaprealm
? true : false;
6190 mdcache
->touch_dentry_bottom(straydn
);
6193 dn
->mark_dirty(dnpv
, mdr
->ls
);
6196 if (snap_is_new
) //only new if strayin exists
6197 mdcache
->do_realm_invalidate_and_update_notify(strayin
, CEPH_SNAP_OP_SPLIT
, true);
6199 mdcache
->send_dentry_unlink(dn
, straydn
, mdr
);
6201 // update subtree map?
6202 if (straydn
&& strayin
->is_dir())
6203 mdcache
->adjust_subtree_after_rename(strayin
, dn
->get_dir(), true);
6206 utime_t now
= ceph_clock_now();
6207 mds
->balancer
->hit_dir(now
, dn
->get_dir(), META_POP_IWR
);
6210 respond_to_request(mdr
, 0);
6212 // removing a new dn?
6213 dn
->get_dir()->try_remove_unlinked_dn(dn
);
6216 // respond_to_request() drops locks. So stray reintegration can race with us.
6217 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
6218 // Tip off the MDCache that this dentry is a stray that
6219 // might be elegible for purge.
6220 mdcache
->notify_stray(straydn
);
6224 bool Server::_rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
)
6226 if (mds
->is_cluster_degraded() &&
6227 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
6228 dout(10) << "_rmdir_prepare_witness mds." << who
<< " is not active" << dendl
;
6229 if (mdr
->more()->waiting_on_slave
.empty())
6230 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
6234 dout(10) << "_rmdir_prepare_witness mds." << who
<< dendl
;
6235 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6236 MMDSSlaveRequest::OP_RMDIRPREP
);
6237 req
->srcdnpath
= filepath(trace
.front()->get_dir()->ino());
6238 for (auto dn
: trace
)
6239 req
->srcdnpath
.push_dentry(dn
->get_name());
6240 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
6242 req
->op_stamp
= mdr
->get_op_stamp();
6243 mds
->send_message_mds(req
, who
);
6245 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
6246 mdr
->more()->waiting_on_slave
.insert(who
);
6250 struct C_MDS_SlaveRmdirPrep
: public ServerLogContext
{
6251 CDentry
*dn
, *straydn
;
6252 C_MDS_SlaveRmdirPrep(Server
*s
, MDRequestRef
& r
, CDentry
*d
, CDentry
*st
)
6253 : ServerLogContext(s
, r
), dn(d
), straydn(st
) {}
6254 void finish(int r
) override
{
6255 server
->_logged_slave_rmdir(mdr
, dn
, straydn
);
6259 struct C_MDS_SlaveRmdirCommit
: public ServerContext
{
6262 C_MDS_SlaveRmdirCommit(Server
*s
, MDRequestRef
& r
, CDentry
*sd
)
6263 : ServerContext(s
), mdr(r
), straydn(sd
) { }
6264 void finish(int r
) override
{
6265 server
->_commit_slave_rmdir(mdr
, r
, straydn
);
6269 void Server::handle_slave_rmdir_prep(MDRequestRef
& mdr
)
6271 dout(10) << "handle_slave_rmdir_prep " << *mdr
6272 << " " << mdr
->slave_request
->srcdnpath
6273 << " to " << mdr
->slave_request
->destdnpath
6276 vector
<CDentry
*> trace
;
6277 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
6278 dout(10) << " src " << srcpath
<< dendl
;
6280 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &trace
, &in
, MDS_TRAVERSE_DISCOVERXLOCK
);
6283 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
6288 CDentry
*dn
= trace
.back();
6289 dout(10) << " dn " << *dn
<< dendl
;
6292 assert(mdr
->straydn
);
6293 CDentry
*straydn
= mdr
->straydn
;
6294 dout(10) << " straydn " << *straydn
<< dendl
;
6296 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
6298 rmdir_rollback rollback
;
6299 rollback
.reqid
= mdr
->reqid
;
6300 rollback
.src_dir
= dn
->get_dir()->dirfrag();
6301 rollback
.src_dname
= std::string(dn
->get_name());
6302 rollback
.dest_dir
= straydn
->get_dir()->dirfrag();
6303 rollback
.dest_dname
= std::string(straydn
->get_name());
6304 ::encode(rollback
, mdr
->more()->rollback_bl
);
6305 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
6307 // set up commit waiter
6308 mdr
->more()->slave_commit
= new C_MDS_SlaveRmdirCommit(this, mdr
, straydn
);
6310 if (!in
->has_subtree_root_dirfrag(mds
->get_nodeid())) {
6311 dout(10) << " no auth subtree in " << *in
<< ", skipping journal" << dendl
;
6312 dn
->get_dir()->unlink_inode(dn
);
6313 straydn
->get_dir()->link_primary_inode(straydn
, in
);
6315 assert(straydn
->first
>= in
->first
);
6316 in
->first
= straydn
->first
;
6318 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), false);
6320 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6321 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6322 reply
->mark_not_journaled();
6323 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
6325 // send caps to auth (if we're not already)
6326 if (in
->is_any_caps() && !in
->state_test(CInode::STATE_EXPORTINGCAPS
))
6327 mdcache
->migrator
->export_caps(in
);
6329 mdcache
->touch_dentry_bottom(straydn
); // move stray to end of lru
6331 mdr
->slave_request
->put();
6332 mdr
->slave_request
= 0;
6337 straydn
->push_projected_linkage(in
);
6338 dn
->push_projected_linkage();
6340 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir", mdr
->reqid
, mdr
->slave_to_mds
,
6341 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RMDIR
);
6342 mdlog
->start_entry(le
);
6343 le
->rollback
= mdr
->more()->rollback_bl
;
6345 le
->commit
.add_dir_context(straydn
->get_dir());
6346 le
->commit
.add_primary_dentry(straydn
, in
, true);
6347 // slave: no need to journal original dentry
6349 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6350 le
->commit
.renamed_dirino
= in
->ino();
6352 mdcache
->project_subtree_rename(in
, dn
->get_dir(), straydn
->get_dir());
6354 mdr
->more()->slave_update_journaled
= true;
6355 submit_mdlog_entry(le
, new C_MDS_SlaveRmdirPrep(this, mdr
, dn
, straydn
),
// Callback run after the slave-side rmdir ESlaveUpdate (OP_PREPARE) has been
// journaled: apply the projected unlink/rename-to-stray in our cache so it is
// consistent with the journal, then ack the master with OP_RMDIRPREPACK
// (unless the request was aborted, in which case just finish it).
// NOTE(review): this chunk is a lossy extraction -- some original lines
// (braces, an 'else', blank lines) are not visible; comments hedge accordingly.
6360 void Server::_logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
)
6362 dout(10) << "_logged_slave_rmdir " << *mdr
<< " on " << *dn
<< dendl
;
6364 // update our cache now, so we are consistent with what is in the journal
6365 // when we journal a subtree map
6366 CInode
*in
= dn
->get_linkage()->get_inode();
// unlink from the source dentry and make the projected stray/null linkages real
6367 dn
->get_dir()->unlink_inode(dn
);
6368 straydn
->pop_projected_linkage();
6369 dn
->pop_projected_linkage();
// 'true' => this rename into the stray dir was journaled on this rank
6370 mdcache
->adjust_subtree_after_rename(in
, dn
->get_dir(), true);
// done with the slave request message itself
6373 mdr
->reset_slave_request();
6376 if (!mdr
->aborted
) {
// tell the master we have prepared (and journaled) the rmdir
6377 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
6378 MMDSSlaveRequest::OP_RMDIRPREPACK
);
6379 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
// presumably the else-branch of the abort check (the 'else' line is not
// visible in this extraction) -- TODO confirm against the full source
6381 dout(10) << " abort flag set, finishing" << dendl
;
6382 mdcache
->request_finish(mdr
);
// Master-side handler for a slave's OP_RMDIRPREPACK: record the sender as a
// participating slave and witness, note whether it journaled its prepare, and
// re-dispatch the client request once every awaited slave has acked.
6386 void Server::handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
6388 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6389 << " " << *ack
<< dendl
;
6391 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
// the sender is now both a participating slave and a witness of the rmdir
6393 mdr
->more()->slaves
.insert(from
);
6394 mdr
->more()->witnessed
.insert(from
);
// if the slave journaled its prepare, a commit/rollback round is required later
6395 if (!ack
->is_not_journaled())
6396 mdr
->more()->has_journaled_slaves
= true;
6398 // remove from waiting list
6399 assert(mdr
->more()->waiting_on_slave
.count(from
));
6400 mdr
->more()->waiting_on_slave
.erase(from
);
6402 if (mdr
->more()->waiting_on_slave
.empty())
6403 dispatch_client_request(mdr
); // go again!
// presumably the else-branch (the 'else' line is not visible in this extraction)
6405 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
// Finish the slave side of an rmdir once the master's decision (r) arrives.
// Commit path: clear stray dirty bits, then journal an OP_COMMIT ESlaveUpdate
// if the prepare was journaled, else ack immediately via _committed_slave().
// Abort path: replay the saved rollback blob via do_rmdir_rollback().
// NOTE(review): the branch lines selecting commit vs rollback based on r are
// not visible in this extraction -- TODO confirm against the full source.
6408 void Server::_commit_slave_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
)
6410 dout(10) << "_commit_slave_rmdir " << *mdr
<< " r=" << r
<< dendl
;
6413 if (mdr
->more()->slave_update_journaled
) {
6414 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
// no snaprealm => the stray inode can be purged; drop its dirty state now
6415 if (strayin
&& !strayin
->snaprealm
)
6416 mdcache
->clear_dirty_bits_for_stray(strayin
);
6421 if (mdr
->more()->slave_update_journaled
) {
6422 // write a commit to the journal
6423 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_commit", mdr
->reqid
,
6424 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
6425 ESlaveUpdate::RMDIR
);
6426 mdlog
->start_entry(le
);
6427 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
// prepare was never journaled, so there is nothing to commit in the journal
6430 _committed_slave(mdr
);
// abort path: undo the prepared rename-to-stray from the saved rollback blob
6434 do_rmdir_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
);
// Log context fired once the rmdir-rollback ESlaveUpdate has been journaled;
// it simply forwards to Server::_rmdir_rollback_finish() with the captured
// request id and dentries.
// NOTE(review): the member declarations for reqid/dn/straydn (original lines
// 6439-6441) are not visible in this extraction.
6438 struct C_MDS_LoggedRmdirRollback
: public ServerLogContext
{
6442 C_MDS_LoggedRmdirRollback(Server
*s
, MDRequestRef
& m
, metareqid_t mr
, CDentry
*d
, CDentry
*st
)
6443 : ServerLogContext(s
, m
), reqid(mr
), dn(d
), straydn(st
) {}
6444 void finish(int r
) override
{
6445 server
->_rmdir_rollback_finish(mdr
, reqid
, dn
, straydn
);
// Undo a prepared (possibly journaled) slave rmdir: move the inode back from
// the stray dentry to its original dentry. If the prepare was never journaled
// the cache is fixed up directly; otherwise a rollback ESlaveUpdate is
// journaled first and the cache fix-up happens in _rmdir_rollback_finish().
6449 void Server::do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
)
// (typo fixed: "unlike", not "unlink")
6451 // unlike the other rollback methods, the rmdir rollback is only
6452 // needed to record the subtree changes in the journal for inode
6453 // replicas who are auth for empty dirfrags. no actual changes to
6454 // the file system are taking place here, so there is no Mutation.
6456 rmdir_rollback rollback
;
6457 bufferlist::iterator p
= rbl
.begin();
6458 ::decode(rollback
, p
);
6460 dout(10) << "do_rmdir_rollback on " << rollback
.reqid
<< dendl
;
6461 mdcache
->add_rollback(rollback
.reqid
, master
); // need to finish this update before resolve finishes
// with no mdr we must be in resolve (replaying a peer's uncommitted update)
6462 assert(mdr
|| mds
->is_resolve());
6464 CDir
*dir
= mdcache
->get_dirfrag(rollback
.src_dir
);
// fallback lookup by ino+dname (the guarding 'if' line is not visible here)
6466 dir
= mdcache
->get_dirfrag(rollback
.src_dir
.ino
, rollback
.src_dname
);
6468 CDentry
*dn
= dir
->lookup(rollback
.src_dname
);
6470 dout(10) << " dn " << *dn
<< dendl
;
6471 dir
= mdcache
->get_dirfrag(rollback
.dest_dir
);
6473 CDentry
*straydn
= dir
->lookup(rollback
.dest_dname
);
// NOTE(review): this prints *dn but labels it "straydn" -- likely meant
// *straydn; flagged only, code left unchanged
6475 dout(10) << " straydn " << *dn
<< dendl
;
6476 CInode
*in
= straydn
->get_linkage()->get_inode();
// fast path: prepare was never journaled, so fix the cache directly
6478 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
6479 assert(!in
->has_subtree_root_dirfrag(mds
->get_nodeid()));
6481 straydn
->get_dir()->unlink_inode(straydn
);
6482 dn
->get_dir()->link_primary_inode(dn
, in
);
6484 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), false);
6486 mdcache
->request_finish(mdr
);
6487 mdcache
->finish_rollback(rollback
.reqid
);
// journaled path: project the reverse linkages, then journal the rollback
6491 dn
->push_projected_linkage(in
);
6492 straydn
->push_projected_linkage();
6494 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rmdir_rollback", rollback
.reqid
, master
,
6495 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RMDIR
);
6496 mdlog
->start_entry(le
);
6498 le
->commit
.add_dir_context(dn
->get_dir());
6499 le
->commit
.add_primary_dentry(dn
, in
, true);
6500 // slave: no need to journal straydn
6502 dout(10) << " noting renamed (unlinked) dir ino " << in
->ino() << " in metablob" << dendl
;
6503 le
->commit
.renamed_dirino
= in
->ino();
6505 mdcache
->project_subtree_rename(in
, straydn
->get_dir(), dn
->get_dir());
6507 submit_mdlog_entry(le
,
6508 new C_MDS_LoggedRmdirRollback(this, mdr
,rollback
.reqid
,
// Completion of a journaled rmdir rollback: make the projected reverse
// linkages real (inode back on its original dentry, stray dentry emptied),
// fix subtrees, and -- during resolve -- trim any non-auth subtree left over.
6514 void Server::_rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
)
6516 dout(10) << "_rmdir_rollback_finish " << reqid
<< dendl
;
6518 straydn
->get_dir()->unlink_inode(straydn
);
6519 dn
->pop_projected_linkage();
6520 straydn
->pop_projected_linkage();
6522 CInode
*in
= dn
->get_linkage()->get_inode();
// 'true' => this rollback was journaled on this rank
6523 mdcache
->adjust_subtree_after_rename(in
, straydn
->get_dir(), true);
6524 if (mds
->is_resolve()) {
6525 CDir
*root
= mdcache
->get_subtree_root(straydn
->get_dir());
6526 mdcache
->try_trim_non_auth_subtree(root
);
// (guard on mdr is not visible in this extraction -- TODO confirm)
6530 mdcache
->request_finish(mdr
);
6532 mdcache
->finish_rollback(reqid
);
6536 /** _dir_is_nonempty[_unlocked]
6538 * check whether a directory is non-empty (i.e. whether rmdir must be refused).
6540 * the unlocked variant is a fastpath check. we can't really be
6541 * sure until we rdlock the filelock.
// Fast, lock-free (hence approximate) emptiness check used before rmdir /
// rename-over-dir: returns true if the dir is snapshotted or an auth
// dirfrag's projected fragstat shows entries. The authoritative check, done
// under the filelock rdlock, is _dir_is_nonempty().
// NOTE(review): the loop body references 'dir' whose per-frag declaration
// (from *p) is not visible in this extraction, nor are the return statements.
6543 bool Server::_dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*in
)
6545 dout(10) << "dir_is_nonempty_unlocked " << *in
<< dendl
;
6546 assert(in
->is_auth());
// a snapshotted dir can never be removed, regardless of fragstats
6548 if (in
->snaprealm
&& in
->snaprealm
->srnode
.snaps
.size())
6549 return true; // in a snapshot!
6552 in
->get_dirfrags(ls
);
6553 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6555 // is the frag obviously non-empty?
6556 if (dir
->is_auth()) {
6557 if (dir
->get_projected_fnode()->fragstat
.size()) {
6558 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6559 << dir
->get_projected_fnode()->fragstat
.size() << " items " << *dir
<< dendl
;
// Authoritative emptiness check; requires the caller to hold a readable
// filelock on 'in'. Sums projected fragstats across all dirfrags (using the
// accounted fragstat when its version matches the inode's dirstat version)
// and compares against the inode-level dirstat size.
// NOTE(review): some lines (the per-frag 'dir' declaration, an early return
// in the loop) are not visible in this extraction.
6568 bool Server::_dir_is_nonempty(MDRequestRef
& mdr
, CInode
*in
)
6570 dout(10) << "dir_is_nonempty " << *in
<< dendl
;
6571 assert(in
->is_auth());
6572 assert(in
->filelock
.can_read(mdr
->get_client()));
6574 frag_info_t dirstat
;
6575 version_t dirstat_version
= in
->get_projected_inode()->dirstat
.version
;
6578 in
->get_dirfrags(ls
);
6579 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
6581 const fnode_t
*pf
= dir
->get_projected_fnode();
// any frag with entries makes the dir non-empty immediately
6582 if (pf
->fragstat
.size()) {
6583 dout(10) << "dir_is_nonempty dirstat has "
6584 << pf
->fragstat
.size() << " items " << *dir
<< dendl
;
// prefer the accounted fragstat when it is in sync with the inode dirstat
6588 if (pf
->accounted_fragstat
.version
== dirstat_version
)
6589 dirstat
.add(pf
->accounted_fragstat
);
6591 dirstat
.add(pf
->fragstat
);
// a mismatch means some frag content is not yet accounted => non-empty
6594 return dirstat
.size() != in
->get_projected_inode()->dirstat
.size();
6598 // ======================================================
// Log context for the master-side rename: once the EUpdate has been
// journaled, apply the rename via Server::_rename_finish() with the captured
// src/dest/stray dentries.
// NOTE(review): the member declarations (original lines 6602-6605) are not
// visible in this extraction.
6601 class C_MDS_rename_finish
: public ServerLogContext
{
6606 C_MDS_rename_finish(Server
*s
, MDRequestRef
& r
,
6607 CDentry
*sdn
, CDentry
*ddn
, CDentry
*stdn
) :
6608 ServerLogContext(s
, r
),
6609 srcdn(sdn
), destdn(ddn
), straydn(stdn
) { }
6610 void finish(int r
) override
{
6612 server
->_rename_finish(mdr
, srcdn
, destdn
, straydn
);
6617 /** handle_client_rename
6619 * rename master is the destdn auth. this is because cached inodes
6620 * must remain connected. thus, any replica of srci, must also
6621 * replicate destdn, and possibly straydn, so that srci (and
6622 * destdn->inode) remain connected during the rename.
6624 * to do this, we freeze srci, then master (destdn auth) verifies that
6625 * all other nodes have also replicated destdn and straydn. note that
6626 * destdn replicas need not also replicate srci. this only works when
6629 * This function takes responsibility for the passed mdr.
// Master-side entry point for a client rename; the destdn auth is the rename
// master. Stages: path validity checks -> lock destdn -> traverse srcpath ->
// sanity checks (common ancestor, src==dest no-op, cycle, stray migration) ->
// stray dentry for an overwritten primary -> witness list -> lock acquisition
// (incl. remote wrlocks / frozen authpin) -> access/quota checks -> witness
// prepares (srcdn auth last) -> journal the EUpdate and reply.
// NOTE(review): this chunk is a lossy extraction -- early-return 'return;'
// lines, several 'if'/'else' lines and closing braces are not visible;
// comments hedge where control flow cannot be confirmed.
6631 void Server::handle_client_rename(MDRequestRef
& mdr
)
6633 MClientRequest
*req
= mdr
->client_request
;
6634 dout(7) << "handle_client_rename " << *req
<< dendl
;
6636 filepath destpath
= req
->get_filepath();
6637 filepath srcpath
= req
->get_filepath2();
// both paths must name a dentry, and neither may end in '.' or '..'
6638 if (destpath
.depth() == 0 || srcpath
.depth() == 0) {
6639 respond_to_request(mdr
, -EINVAL
);
6642 if (srcpath
.is_last_dot_or_dotdot() || destpath
.is_last_dot_or_dotdot()) {
6643 respond_to_request(mdr
, -EBUSY
);
6647 boost::string_view destname
= destpath
.last_dentry();
6649 vector
<CDentry
*>& srctrace
= mdr
->dn
[1];
6650 vector
<CDentry
*>& desttrace
= mdr
->dn
[0];
// lock sets accumulated across the whole operation
6652 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
6654 CDentry
*destdn
= rdlock_path_xlock_dentry(mdr
, 0, rdlocks
, wrlocks
, xlocks
, true, false, true);
6655 if (!destdn
) return;
6656 dout(10) << " destdn " << *destdn
<< dendl
;
6657 if (mdr
->snapid
!= CEPH_NOSNAP
) {
6658 respond_to_request(mdr
, -EROFS
);
6661 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
6662 CDir
*destdir
= destdn
->get_dir();
6663 assert(destdir
->is_auth());
6665 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &srctrace
, NULL
, MDS_TRAVERSE_DISCOVER
);
// error handling for the traverse (the branch lines on r are not visible)
6670 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl
;
6671 mdcache
->find_ino_peers(srcpath
.get_ino(), new C_MDS_TryFindInode(this, mdr
));
6673 dout(10) << "FAIL on error " << r
<< dendl
;
6674 respond_to_request(mdr
, r
);
6679 assert(!srctrace
.empty());
6680 CDentry
*srcdn
= srctrace
.back();
6681 dout(10) << " srcdn " << *srcdn
<< dendl
;
6682 if (srcdn
->last
!= CEPH_NOSNAP
) {
6683 respond_to_request(mdr
, -EROFS
);
6686 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
6687 CInode
*srci
= srcdnl
->get_inode();
6688 dout(10) << " srci " << *srci
<< dendl
;
// NOTE(review): oldin's declaration (original line ~6690) is not visible here
6691 if (!destdnl
->is_null()) {
6692 //dout(10) << "dest dn exists " << *destdn << dendl;
6693 oldin
= mdcache
->get_dentry_inode(destdn
, mdr
, true);
6695 dout(10) << " oldin " << *oldin
<< dendl
;
6697 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6698 if (oldin
->is_dir() && _dir_is_nonempty_unlocked(mdr
, oldin
)) {
6699 respond_to_request(mdr
, -ENOTEMPTY
);
6703 // if srcdn is replica, need to make sure its linkage is correct
6704 if (srcdn
->is_auth() ||
6705 srcdn
->lock
.can_read(mdr
->get_client()) ||
6706 (srcdn
->lock
.is_xlocked() && srcdn
->lock
.get_xlock_by() == mdr
)) {
6707 // mv /some/thing /to/some/existing_other_thing
6708 if (oldin
->is_dir() && !srci
->is_dir()) {
6709 respond_to_request(mdr
, -EISDIR
);
6712 if (!oldin
->is_dir() && srci
->is_dir()) {
6713 respond_to_request(mdr
, -ENOTDIR
);
6716 if (srci
== oldin
&& !srcdn
->get_dir()->inode
->is_stray()) {
6717 respond_to_request(mdr
, 0); // no-op. POSIX makes no sense.
6723 // -- some sanity checks --
6725 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6726 if (destpath
.get_ino() != srcpath
.get_ino() &&
6727 !(req
->get_source().is_mds() &&
6728 MDS_INO_IS_MDSDIR(srcpath
.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6729 CInode
*srcbase
= srctrace
[0]->get_dir()->get_inode();
6730 CInode
*destbase
= desttrace
[0]->get_dir()->get_inode();
6731 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6732 while (srcbase
!= destbase
&&
6733 !srcbase
->is_projected_ancestor_of(destbase
)) {
6734 CDentry
*pdn
= srcbase
->get_projected_parent_dn();
6735 srctrace
.insert(srctrace
.begin(), pdn
);
6736 dout(10) << "rename prepending srctrace with " << *pdn
<< dendl
;
6737 srcbase
= pdn
->get_dir()->get_inode();
6740 // then, extend destpath until it shares the same parent inode as srcpath.
6741 while (destbase
!= srcbase
) {
6742 CDentry
*pdn
= destbase
->get_projected_parent_dn();
6743 desttrace
.insert(desttrace
.begin(), pdn
);
6744 rdlocks
.insert(&pdn
->lock
);
6745 dout(10) << "rename prepending desttrace with " << *pdn
<< dendl
;
6746 destbase
= pdn
->get_dir()->get_inode();
6748 dout(10) << "rename src and dest traces now share common ancestor " << *destbase
<< dendl
;
6752 if (srcdn
->get_dir() == destdir
&& srcdn
->get_name() == destname
) {
6753 dout(7) << "rename src=dest, noop" << dendl
;
6754 respond_to_request(mdr
, 0);
6758 // dest a child of src?
6759 // e.g. mv /usr /usr/foo
6760 CDentry
*pdn
= destdir
->inode
->get_projected_parent_dn();
// cycle-walk up the tree (the loop header / comparison lines are not visible)
6763 dout(7) << "cannot rename item to be a child of itself" << dendl
;
6764 respond_to_request(mdr
, -EINVAL
);
6767 pdn
= pdn
->get_dir()->inode
->parent
;
6770 // is this a stray migration, reintegration or merge? (sanity checks!)
6771 if (mdr
->reqid
.name
.is_mds() &&
6772 !(MDS_INO_IS_MDSDIR(srcpath
.get_ino()) &&
6773 MDS_INO_IS_MDSDIR(destpath
.get_ino())) &&
6774 !(destdnl
->is_remote() &&
6775 destdnl
->get_remote_ino() == srci
->ino())) {
6776 respond_to_request(mdr
, -EINVAL
); // actually, this won't reply, but whatev.
6780 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
6781 (srcdnl
->is_primary() || destdnl
->is_primary()));
6783 dout(10) << " this is a link merge" << dendl
;
6785 // -- create stray dentry? --
6786 CDentry
*straydn
= NULL
;
6787 if (destdnl
->is_primary() && !linkmerge
) {
6788 straydn
= prepare_stray_dentry(mdr
, destdnl
->get_inode());
6791 dout(10) << " straydn is " << *straydn
<< dendl
;
6792 } else if (mdr
->straydn
) {
6793 mdr
->unpin(mdr
->straydn
);
6794 mdr
->straydn
= NULL
;
6797 // -- prepare witness list --
6799 * NOTE: we use _all_ replicas as witnesses.
6800 * this probably isn't totally necessary (esp for file renames),
6801 * but if/when we change that, we have to make sure rejoin is
6802 * sufficiently robust to handle strong rejoins from survivors
6803 * with totally wrong dentry->inode linkage.
6804 * (currently, it can ignore rename effects, because the resolve
6805 * stage will sort them out.)
6807 set
<mds_rank_t
> witnesses
= mdr
->more()->extra_witnesses
;
6808 if (srcdn
->is_auth())
6809 srcdn
->list_replicas(witnesses
);
6811 witnesses
.insert(srcdn
->authority().first
);
6812 if (srcdnl
->is_remote() && !srci
->is_auth())
6813 witnesses
.insert(srci
->authority().first
);
6814 destdn
->list_replicas(witnesses
);
6815 if (destdnl
->is_remote() && !oldin
->is_auth())
6816 witnesses
.insert(oldin
->authority().first
);
6817 dout(10) << " witnesses " << witnesses
<< ", have " << mdr
->more()->witnessed
<< dendl
;
// -- locks --
6821 map
<SimpleLock
*, mds_rank_t
> remote_wrlocks
;
6823 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6824 for (int i
=0; i
<(int)srctrace
.size(); i
++)
6825 rdlocks
.insert(&srctrace
[i
]->lock
);
6826 xlocks
.insert(&srcdn
->lock
);
6827 mds_rank_t srcdirauth
= srcdn
->get_dir()->authority().first
;
6828 if (srcdirauth
!= mds
->get_nodeid()) {
6829 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth
<< dendl
;
6830 remote_wrlocks
[&srcdn
->get_dir()->inode
->filelock
] = srcdirauth
;
6831 remote_wrlocks
[&srcdn
->get_dir()->inode
->nestlock
] = srcdirauth
;
6833 rdlocks
.insert(&srci
->dirfragtreelock
);
6835 wrlocks
.insert(&srcdn
->get_dir()->inode
->filelock
);
6836 wrlocks
.insert(&srcdn
->get_dir()->inode
->nestlock
);
6838 mds
->locker
->include_snap_rdlocks(rdlocks
, srcdn
->get_dir()->inode
);
// stray-dir scatterlocks + stray dentry xlock (guard line not visible here)
6842 wrlocks
.insert(&straydn
->get_dir()->inode
->filelock
);
6843 wrlocks
.insert(&straydn
->get_dir()->inode
->nestlock
);
6844 xlocks
.insert(&straydn
->lock
);
6847 // xlock versionlock on dentries if there are witnesses.
6848 // replicas can't see projected dentry linkages, and will get
6849 // confused if we try to pipeline things.
6850 if (!witnesses
.empty()) {
6851 // take xlock on all projected ancestor dentries for srcdn and destdn.
6852 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6853 for (int i
= 0; i
<(int)srctrace
.size(); i
++) {
6854 if (srctrace
[i
]->is_auth() && srctrace
[i
]->is_projected())
6855 xlocks
.insert(&srctrace
[i
]->versionlock
);
6857 for (int i
=0; i
<(int)desttrace
.size(); i
++) {
6858 if (desttrace
[i
]->is_auth() && desttrace
[i
]->is_projected())
6859 xlocks
.insert(&desttrace
[i
]->versionlock
);
6861 // xlock srci and oldin's primary dentries, so witnesses can call
6862 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6864 if (srcdnl
->is_remote())
6865 xlocks
.insert(&srci
->get_projected_parent_dn()->lock
);
6866 if (destdnl
->is_remote())
6867 xlocks
.insert(&oldin
->get_projected_parent_dn()->lock
);
6870 // we need to update srci's ctime. xlock its least contended lock to do that...
6871 xlocks
.insert(&srci
->linklock
);
6873 // xlock oldin (for nlink--)
6875 xlocks
.insert(&oldin
->linklock
);
6876 if (oldin
->is_dir())
6877 rdlocks
.insert(&oldin
->filelock
);
6879 if (srcdnl
->is_primary() && srci
->is_dir())
6880 // FIXME: this should happen whenever we are renaming between
6881 // realms, regardless of the file type
6882 // FIXME: If/when this changes, make sure to update the
6883 // "allowance" in handle_slave_rename_prep
6884 xlocks
.insert(&srci
->snaplock
); // FIXME: an auth bcast could be sufficient?
6886 rdlocks
.insert(&srci
->snaplock
);
// freeze srci via a remote authpin when another rank is auth for its primary dn
6888 CInode
*auth_pin_freeze
= !srcdn
->is_auth() && srcdnl
->is_primary() ? srci
: NULL
;
6889 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
,
6890 &remote_wrlocks
, auth_pin_freeze
))
// access/quota checks run only on the first pass (before any witness acked)
6893 if ((!mdr
->has_more() || mdr
->more()->witnessed
.empty())) {
6894 if (!check_access(mdr
, srcdn
->get_dir()->get_inode(), MAY_WRITE
))
6897 if (!check_access(mdr
, destdn
->get_dir()->get_inode(), MAY_WRITE
))
6900 if (!check_fragment_space(mdr
, destdn
->get_dir()))
6903 if (!check_access(mdr
, srci
, MAY_WRITE
))
6907 // with read lock, really verify oldin is empty
6910 _dir_is_nonempty(mdr
, oldin
)) {
6911 respond_to_request(mdr
, -ENOTEMPTY
);
6915 /* project_past_snaprealm_parent() will do this job
6917 // moving between snaprealms?
6918 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6919 SnapRealm *srcrealm = srci->find_snaprealm();
6920 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6921 if (srcrealm != destrealm &&
6922 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6923 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6924 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6925 mdcache->snaprealm_create(mdr, srci);
6931 assert(g_conf
->mds_kill_rename_at
!= 1);
6933 // -- open all srcdn inode frags, if any --
6934 // we need these open so that auth can properly delegate from inode to dirfrags
6935 // after the inode is _ours_.
6936 if (srcdnl
->is_primary() &&
6937 !srcdn
->is_auth() &&
6939 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl
;
6940 mdr
->set_stickydirs(srci
);
6943 srci
->dirfragtree
.get_leaves(frags
);
6944 for (list
<frag_t
>::iterator p
= frags
.begin();
6947 CDir
*dir
= srci
->get_dirfrag(*p
);
6949 dout(10) << " opening " << *p
<< " under " << *srci
<< dendl
;
6950 mdcache
->open_remote_dirfrag(srci
, *p
, new C_MDS_RetryRequest(mdcache
, mdr
));
6956 // -- prepare witnesses --
6958 // do srcdn auth last
6959 mds_rank_t last
= MDS_RANK_NONE
;
6960 if (!srcdn
->is_auth()) {
6961 last
= srcdn
->authority().first
;
6962 mdr
->more()->srcdn_auth_mds
= last
;
6963 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6964 // are involved in the rename operation.
6965 if (srcdnl
->is_primary() && !mdr
->more()->is_ambiguous_auth
) {
6966 dout(10) << " preparing ambiguous auth for srci" << dendl
;
6967 assert(mdr
->more()->is_remote_frozen_authpin
);
6968 assert(mdr
->more()->rename_inode
== srci
);
6969 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
6974 for (set
<mds_rank_t
>::iterator p
= witnesses
.begin();
6975 p
!= witnesses
.end();
6977 if (*p
== last
) continue; // do it last!
6978 if (mdr
->more()->witnessed
.count(*p
)) {
6979 dout(10) << " already witnessed by mds." << *p
<< dendl
;
6980 } else if (mdr
->more()->waiting_on_slave
.count(*p
)) {
6981 dout(10) << " already waiting on witness mds." << *p
<< dendl
;
6983 if (!_rename_prepare_witness(mdr
, *p
, witnesses
, srctrace
, desttrace
, straydn
))
6987 if (!mdr
->more()->waiting_on_slave
.empty())
6988 return; // we're waiting for a witness.
6990 if (last
!= MDS_RANK_NONE
&& mdr
->more()->witnessed
.count(last
) == 0) {
6991 dout(10) << " preparing last witness (srcdn auth)" << dendl
;
6992 assert(mdr
->more()->waiting_on_slave
.count(last
) == 0);
6993 _rename_prepare_witness(mdr
, last
, witnesses
, srctrace
, desttrace
, straydn
);
6997 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6998 if (!mdr
->more()->slaves
.empty() && !srci
->is_dir())
6999 assert(g_conf
->mds_kill_rename_at
!= 3);
7000 if (!mdr
->more()->slaves
.empty() && srci
->is_dir())
7001 assert(g_conf
->mds_kill_rename_at
!= 4);
7003 // -- declare now --
7004 mdr
->set_mds_stamp(ceph_clock_now());
7006 // -- prepare journal entry --
7007 mdr
->ls
= mdlog
->get_current_segment();
7008 EUpdate
*le
= new EUpdate(mdlog
, "rename");
7009 mdlog
->start_entry(le
);
7010 le
->metablob
.add_client_req(mdr
->reqid
, mdr
->client_request
->get_oldest_client_tid());
7011 if (!mdr
->more()->witnessed
.empty()) {
7012 dout(20) << " noting uncommitted_slaves " << mdr
->more()->witnessed
<< dendl
;
7014 le
->reqid
= mdr
->reqid
;
7015 le
->had_slaves
= true;
7017 mdcache
->add_uncommitted_master(mdr
->reqid
, mdr
->ls
, mdr
->more()->witnessed
);
7018 // no need to send frozen auth pin to recovering auth MDS of srci
7019 mdr
->more()->is_remote_frozen_authpin
= false;
7022 _rename_prepare(mdr
, &le
->metablob
, &le
->client_map
, srcdn
, destdn
, straydn
);
7023 if (le
->client_map
.length())
7024 le
->cmapv
= mds
->sessionmap
.get_projected();
7026 // -- commit locally --
7027 C_MDS_rename_finish
*fin
= new C_MDS_rename_finish(this, mdr
, srcdn
, destdn
, straydn
);
7029 journal_and_reply(mdr
, srci
, destdn
, le
, fin
);
// Applied once the rename EUpdate has been journaled: apply the rename to the
// cache, notify replicas of the new dentry link, ack any imported caps, bump
// balancer popularity, reply to the client, and kick stray reintegration.
// NOTE(review): some branch/guard lines are not visible in this extraction.
7033 void Server::_rename_finish(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7035 dout(10) << "_rename_finish " << *mdr
<< dendl
;
7037 if (!mdr
->more()->witnessed
.empty())
7038 mdcache
->logged_master_update(mdr
->reqid
);
// apply to our cache
7041 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
7043 mdcache
->send_dentry_link(destdn
, mdr
);
7045 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7046 CInode
*in
= destdnl
->get_inode();
// caps were imported with the inode => re-evaluate lock state after replying
7047 bool need_eval
= mdr
->more()->cap_imports
.count(in
);
7049 // test hack: test slave commit
7050 if (!mdr
->more()->slaves
.empty() && !in
->is_dir())
7051 assert(g_conf
->mds_kill_rename_at
!= 5);
7052 if (!mdr
->more()->slaves
.empty() && in
->is_dir())
7053 assert(g_conf
->mds_kill_rename_at
!= 6);
7056 utime_t now
= ceph_clock_now();
7057 mds
->balancer
->hit_dir(now
, srcdn
->get_dir(), META_POP_IWR
);
7058 if (destdnl
->is_remote() && in
->is_auth())
7059 mds
->balancer
->hit_inode(now
, in
, META_POP_IWR
);
7061 // did we import srci? if so, explicitly ack that import, before we unlock and reply.
7063 assert(g_conf
->mds_kill_rename_at
!= 7);
// reply (drops locks)
7066 respond_to_request(mdr
, 0);
7069 mds
->locker
->eval(in
, CEPH_CAP_LOCKS
, true);
7072 // respond_to_request() drops locks. So stray reintegration can race with us.
7073 if (straydn
&& !straydn
->get_projected_linkage()->is_null()) {
7074 mdcache
->notify_stray(straydn
);
// Send an OP_RENAMEPREP slave request to rank 'who', carrying the src/dest
// dentry paths, a replicated stray dentry, the srcdn auth rank and the
// current witness set; then register 'who' in waiting_on_slave. If the peer
// is not yet active, wait for it and retry instead.
// NOTE(review): parameter name 'witnesse' looks like a typo for 'witnesses'
// (cannot be renamed here without touching code).
7082 bool Server::_rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
7083 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
)
7085 if (mds
->is_cluster_degraded() &&
7086 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(who
)) {
7087 dout(10) << "_rename_prepare_witness mds." << who
<< " is not active" << dendl
;
// only queue one waiter; a retry will re-send to everyone still missing
7088 if (mdr
->more()->waiting_on_slave
.empty())
7089 mds
->wait_for_active_peer(who
, new C_MDS_RetryRequest(mdcache
, mdr
));
7093 dout(10) << "_rename_prepare_witness mds." << who
<< dendl
;
7094 MMDSSlaveRequest
*req
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
7095 MMDSSlaveRequest::OP_RENAMEPREP
);
// encode src and dest dentry paths relative to their base dirs
7097 req
->srcdnpath
= filepath(srctrace
.front()->get_dir()->ino());
7098 for (auto dn
: srctrace
)
7099 req
->srcdnpath
.push_dentry(dn
->get_name());
7100 req
->destdnpath
= filepath(dsttrace
.front()->get_dir()->ino());
7101 for (auto dn
: dsttrace
)
7102 req
->destdnpath
.push_dentry(dn
->get_name());
7104 mdcache
->replicate_stray(straydn
, who
, req
->stray
);
7106 req
->srcdn_auth
= mdr
->more()->srcdn_auth_mds
;
7108 // srcdn auth will verify our current witness list is sufficient
7109 req
->witnesses
= witnesse
;
7111 req
->op_stamp
= mdr
->get_op_stamp();
7112 mds
->send_message_mds(req
, who
);
7114 assert(mdr
->more()->waiting_on_slave
.count(who
) == 0);
7115 mdr
->more()->waiting_on_slave
.insert(who
);
// On the rename master, import srci's inode state (encoded by the srcdn auth
// into more()->inode_import): force-open the client sessions referenced by
// the bundled client map (re-encoding it for the journal into client_map_bl),
// decode the inode plus caps, then temporarily force the inode back to
// !auth/clean until the rename commits. Returns the imported inode version
// (more()->inode_import_v).
7119 version_t
Server::_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
)
7121 version_t oldpv
= mdr
->more()->inode_import_v
;
7123 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7126 bufferlist::iterator blp
= mdr
->more()->inode_import
.begin();
7129 map
<client_t
,entity_inst_t
> client_map
;
7130 decode(client_map
, blp
);
7131 prepare_force_open_sessions(client_map
, mdr
->more()->imported_session_map
);
7132 encode(client_map
, *client_map_bl
, mds
->mdsmap
->get_up_features());
7134 list
<ScatterLock
*> updated_scatterlocks
;
7135 mdcache
->migrator
->decode_import_inode(srcdn
, blp
, srcdn
->authority().first
, mdr
->ls
,
7136 mdr
->more()->cap_imports
, updated_scatterlocks
);
7138 // hack: force back to !auth and clean, temporarily
7139 srcdnl
->get_inode()->state_clear(CInode::STATE_AUTH
);
7140 srcdnl
->get_inode()->mark_clean();
// Decide whether a rename touching directory inode 'diri' must be journaled
// locally even though we are not auth for the dentries: true if any of
// diri's dirfrags is an auth subtree root here, or if any locally-auth
// subtree is nested beneath one of those frags.
// NOTE(review): the 'empty' parameter is not referenced in the lines visible
// in this extraction -- TODO confirm its role against the full source.
7145 bool Server::_need_force_journal(CInode
*diri
, bool empty
)
7148 diri
->get_dirfrags(ls
);
7150 bool force_journal
= false;
// pass 1: is any of diri's own frags an auth subtree root?
7152 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7153 if ((*p
)->is_subtree_root() && (*p
)->get_dir_auth().first
== mds
->get_nodeid()) {
7154 dout(10) << " frag " << (*p
)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl
;
7155 force_journal
= true;
7158 dout(20) << " frag " << (*p
)->get_frag() << " is not auth subtree dirfrag" << dendl
;
7161 // see if any children of our frags are auth subtrees.
7162 list
<CDir
*> subtrees
;
7163 mdcache
->list_subtrees(subtrees
);
7164 dout(10) << " subtrees " << subtrees
<< " frags " << ls
<< dendl
;
// pass 2: auth subtrees nested anywhere beneath one of diri's frags
7165 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7167 for (list
<CDir
*>::iterator q
= subtrees
.begin(); q
!= subtrees
.end(); ++q
) {
7168 if (dir
->contains(*q
)) {
7169 if ((*q
)->get_dir_auth().first
== mds
->get_nodeid()) {
7170 dout(10) << " frag " << (*p
)->get_frag() << " contains (maybe) auth subtree, will force journal "
7172 force_journal
= true;
7175 dout(20) << " frag " << (*p
)->get_frag() << " contains but isn't auth for " << **q
<< dendl
;
7177 dout(20) << " frag " << (*p
)->get_frag() << " does not contain " << **q
<< dendl
;
7183 return force_journal
;
7186 void Server::_rename_prepare(MDRequestRef
& mdr
,
7187 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
7188 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7190 dout(10) << "_rename_prepare " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
7192 dout(10) << " straydn " << *straydn
<< dendl
;
7194 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7195 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7196 CInode
*srci
= srcdnl
->get_inode();
7197 CInode
*oldin
= destdnl
->get_inode();
7199 // primary+remote link merge?
7200 bool linkmerge
= (srci
== destdnl
->get_inode() &&
7201 (srcdnl
->is_primary() || destdnl
->is_primary()));
7202 bool silent
= srcdn
->get_dir()->inode
->is_stray();
7204 bool force_journal_dest
= false;
7205 if (srci
->is_dir() && !destdn
->is_auth()) {
7206 if (srci
->is_auth()) {
7207 // if we are auth for srci and exporting it, force journal because journal replay needs
7208 // the source inode to create auth subtrees.
7209 dout(10) << " we are exporting srci, will force journal destdn" << dendl
;
7210 force_journal_dest
= true;
7212 force_journal_dest
= _need_force_journal(srci
, false);
7215 bool force_journal_stray
= false;
7216 if (oldin
&& oldin
->is_dir() && straydn
&& !straydn
->is_auth())
7217 force_journal_stray
= _need_force_journal(oldin
, true);
7220 dout(10) << " merging remote and primary links to the same inode" << dendl
;
7222 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl
;
7223 if (force_journal_dest
)
7224 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl
;
7225 if (force_journal_stray
)
7226 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl
;
7228 if (srci
->is_dir() && (destdn
->is_auth() || force_journal_dest
)) {
7229 dout(10) << " noting renamed dir ino " << srci
->ino() << " in metablob" << dendl
;
7230 metablob
->renamed_dirino
= srci
->ino();
7231 } else if (oldin
&& oldin
->is_dir() && force_journal_stray
) {
7232 dout(10) << " noting rename target dir " << oldin
->ino() << " in metablob" << dendl
;
7233 metablob
->renamed_dirino
= oldin
->ino();
7237 CInode::mempool_inode
*spi
= 0; // renamed inode
7238 CInode::mempool_inode
*tpi
= 0; // target/overwritten inode
7242 if (destdnl
->is_primary()) {
7243 assert(straydn
); // moving to straydn.
7244 // link--, and move.
7245 if (destdn
->is_auth()) {
7246 auto &pi
= oldin
->project_inode(); //project_snaprealm
7247 pi
.inode
.version
= straydn
->pre_dirty(pi
.inode
.version
);
7248 pi
.inode
.update_backtrace();
7251 straydn
->push_projected_linkage(oldin
);
7252 } else if (destdnl
->is_remote()) {
7254 if (oldin
->is_auth()) {
7255 auto &pi
= oldin
->project_inode();
7256 pi
.inode
.version
= oldin
->pre_dirty();
7263 if (srcdnl
->is_remote()) {
7266 if (destdn
->is_auth())
7267 mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty();
7268 destdn
->push_projected_linkage(srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
7270 if (srci
->is_auth()) {
7271 auto &pi
= srci
->project_inode();
7272 pi
.inode
.version
= srci
->pre_dirty();
7276 dout(10) << " will merge remote onto primary link" << dendl
;
7277 if (destdn
->is_auth()) {
7278 auto &pi
= oldin
->project_inode();
7279 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldin
->inode
.version
);
7284 if (destdn
->is_auth()) {
7286 if (srcdn
->is_auth())
7287 oldpv
= srci
->get_projected_version();
7289 oldpv
= _rename_prepare_import(mdr
, srcdn
, client_map_bl
);
7291 // note which dirfrags have child subtrees in the journal
7292 // event, so that we can open those (as bounds) during replay.
7293 if (srci
->is_dir()) {
7295 srci
->get_dirfrags(ls
);
7296 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7298 if (!dir
->is_auth())
7299 metablob
->renamed_dir_frags
.push_back(dir
->get_frag());
7301 dout(10) << " noting renamed dir open frags " << metablob
->renamed_dir_frags
<< dendl
;
7304 auto &pi
= srci
->project_inode(); // project snaprealm if srcdnl->is_primary
7305 // & srcdnl->snaprealm
7306 pi
.inode
.version
= mdr
->more()->pvmap
[destdn
] = destdn
->pre_dirty(oldpv
);
7307 pi
.inode
.update_backtrace();
7310 destdn
->push_projected_linkage(srci
);
7314 if (srcdn
->is_auth())
7315 mdr
->more()->pvmap
[srcdn
] = srcdn
->pre_dirty();
7316 srcdn
->push_projected_linkage(); // push null linkage
7320 spi
->ctime
= mdr
->get_op_stamp();
7321 if (mdr
->get_op_stamp() > spi
->rstat
.rctime
)
7322 spi
->rstat
.rctime
= mdr
->get_op_stamp();
7328 tpi
->ctime
= mdr
->get_op_stamp();
7329 if (mdr
->get_op_stamp() > tpi
->rstat
.rctime
)
7330 tpi
->rstat
.rctime
= mdr
->get_op_stamp();
7334 destdn
->make_path_string(t
, true);
7335 tpi
->stray_prior_path
= mempool::mds_co::string(boost::string_view(t
));
7338 if (tpi
->nlink
== 0)
7339 oldin
->state_set(CInode::STATE_ORPHAN
);
7343 // prepare nesting, mtime updates
7344 int predirty_dir
= silent
? 0:PREDIRTY_DIR
;
7346 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7347 // then link the source inode to destdn
7348 if (destdnl
->is_primary()) {
7350 if (straydn
->is_auth()) {
7351 metablob
->add_dir_context(straydn
->get_dir());
7352 metablob
->add_dir(straydn
->get_dir(), true);
7357 if (destdn
->is_auth() && !destdnl
->is_null()) {
7358 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, destdn
->get_dir(),
7359 (destdnl
->is_primary() ? PREDIRTY_PRIMARY
:0)|predirty_dir
, -1);
7360 if (destdnl
->is_primary()) {
7362 mdcache
->predirty_journal_parents(mdr
, metablob
, oldin
, straydn
->get_dir(),
7363 PREDIRTY_PRIMARY
|PREDIRTY_DIR
, 1);
7368 int predirty_primary
= (srcdnl
->is_primary() && srcdn
->get_dir() != destdn
->get_dir()) ? PREDIRTY_PRIMARY
:0;
7369 int flags
= predirty_dir
| predirty_primary
;
7370 if (srcdn
->is_auth())
7371 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, srcdn
->get_dir(), PREDIRTY_SHALLOW
|flags
, -1);
7372 if (destdn
->is_auth())
7373 mdcache
->predirty_journal_parents(mdr
, metablob
, srci
, destdn
->get_dir(), flags
, 1);
7375 SnapRealm
*src_realm
= srci
->find_snaprealm();
7376 SnapRealm
*dest_realm
= destdn
->get_dir()->inode
->find_snaprealm();
7377 snapid_t next_dest_snap
= dest_realm
->get_newest_seq() + 1;
7379 // add it all to the metablob
7382 if (destdnl
->is_primary()) {
7384 if (destdn
->is_auth()) {
7385 // project snaprealm, too
7386 if (oldin
->snaprealm
|| dest_realm
->get_newest_seq() + 1 > oldin
->get_oldest_snap())
7387 oldin
->project_past_snaprealm_parent(straydn
->get_dir()->inode
->find_snaprealm());
7388 straydn
->first
= MAX(oldin
->first
, next_dest_snap
);
7389 metablob
->add_primary_dentry(straydn
, oldin
, true, true);
7390 } else if (force_journal_stray
) {
7391 dout(10) << " forced journaling straydn " << *straydn
<< dendl
;
7392 metablob
->add_dir_context(straydn
->get_dir());
7393 metablob
->add_primary_dentry(straydn
, oldin
, true);
7395 } else if (destdnl
->is_remote()) {
7396 if (oldin
->is_auth()) {
7398 metablob
->add_dir_context(oldin
->get_projected_parent_dir());
7399 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, oldin
->get_projected_parent_dn(),
7400 CEPH_NOSNAP
, 0, destdnl
);
7401 metablob
->add_primary_dentry(oldin
->get_projected_parent_dn(), oldin
, true);
7407 if (srcdnl
->is_remote()) {
7409 if (destdn
->is_auth() && !destdnl
->is_null())
7410 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7412 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7414 if (destdn
->is_auth())
7415 metablob
->add_remote_dentry(destdn
, true, srcdnl
->get_remote_ino(), srcdnl
->get_remote_d_type());
7416 if (srci
->get_projected_parent_dn()->is_auth()) { // it's remote
7417 metablob
->add_dir_context(srci
->get_projected_parent_dir());
7418 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srci
->get_projected_parent_dn(), CEPH_NOSNAP
, 0, srcdnl
);
7419 metablob
->add_primary_dentry(srci
->get_projected_parent_dn(), srci
, true);
7422 if (destdn
->is_auth() && !destdnl
->is_null())
7423 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7425 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7427 if (destdn
->is_auth())
7428 metablob
->add_primary_dentry(destdn
, destdnl
->get_inode(), true, true);
7430 } else if (srcdnl
->is_primary()) {
7431 // project snap parent update?
7432 if (destdn
->is_auth() && src_realm
!= dest_realm
&&
7433 (srci
->snaprealm
|| src_realm
->get_newest_seq() + 1 > srci
->get_oldest_snap()))
7434 srci
->project_past_snaprealm_parent(dest_realm
);
7436 if (destdn
->is_auth() && !destdnl
->is_null())
7437 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, destdn
, CEPH_NOSNAP
, 0, destdnl
);
7439 destdn
->first
= MAX(destdn
->first
, next_dest_snap
);
7441 if (destdn
->is_auth())
7442 metablob
->add_primary_dentry(destdn
, srci
, true, true);
7443 else if (force_journal_dest
) {
7444 dout(10) << " forced journaling destdn " << *destdn
<< dendl
;
7445 metablob
->add_dir_context(destdn
->get_dir());
7446 metablob
->add_primary_dentry(destdn
, srci
, true);
7447 if (srcdn
->is_auth() && srci
->is_dir()) {
7448 // journal new subtrees root dirfrags
7450 srci
->get_dirfrags(ls
);
7451 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
7454 metablob
->add_dir(dir
, true);
7461 if (srcdn
->is_auth()) {
7462 dout(10) << " journaling srcdn " << *srcdn
<< dendl
;
7463 mdcache
->journal_cow_dentry(mdr
.get(), metablob
, srcdn
, CEPH_NOSNAP
, 0, srcdnl
);
7464 // also journal the inode in case we need do slave rename rollback. It is Ok to add
7465 // both primary and NULL dentries. Because during journal replay, null dentry is
7466 // processed after primary dentry.
7467 if (srcdnl
->is_primary() && !srci
->is_dir() && !destdn
->is_auth())
7468 metablob
->add_primary_dentry(srcdn
, srci
, true);
7469 metablob
->add_null_dentry(srcdn
, true);
7471 dout(10) << " NOT journaling srcdn " << *srcdn
<< dendl
;
7473 // make renamed inode first track the dn
7474 if (srcdnl
->is_primary() && destdn
->is_auth())
7475 srci
->first
= destdn
->first
;
7477 if (oldin
&& oldin
->is_dir()) {
7479 mdcache
->project_subtree_rename(oldin
, destdn
->get_dir(), straydn
->get_dir());
7482 mdcache
->project_subtree_rename(srci
, srcdn
->get_dir(), destdn
->get_dir());
7487 void Server::_rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7489 dout(10) << "_rename_apply " << *mdr
<< " " << *srcdn
<< " " << *destdn
<< dendl
;
7490 dout(10) << " pvs " << mdr
->more()->pvmap
<< dendl
;
7492 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7493 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7495 CInode
*oldin
= destdnl
->get_inode();
7497 // primary+remote link merge?
7498 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
7499 (srcdnl
->is_primary() || destdnl
->is_primary()));
7503 if (destdnl
->is_primary()) {
7505 dout(10) << "straydn is " << *straydn
<< dendl
;
7506 destdn
->get_dir()->unlink_inode(destdn
, false);
7508 straydn
->pop_projected_linkage();
7509 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7510 assert(!straydn
->is_projected()); // no other projected
7512 mdcache
->touch_dentry_bottom(straydn
); // drop dn as quickly as possible.
7515 if (destdn
->is_auth()) {
7516 bool hadrealm
= (oldin
->snaprealm
? true : false);
7517 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7518 if (oldin
->snaprealm
&& !hadrealm
)
7519 mdcache
->do_realm_invalidate_and_update_notify(oldin
, CEPH_SNAP_OP_SPLIT
);
7521 // FIXME this snaprealm is not filled out correctly
7522 //oldin->open_snaprealm(); might be sufficient..
7524 } else if (destdnl
->is_remote()) {
7525 destdn
->get_dir()->unlink_inode(destdn
, false);
7526 if (oldin
->is_auth())
7527 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7531 // unlink src before we relink it at dest
7532 CInode
*in
= srcdnl
->get_inode();
7535 bool srcdn_was_remote
= srcdnl
->is_remote();
7536 srcdn
->get_dir()->unlink_inode(srcdn
);
7539 if (srcdn_was_remote
) {
7542 destdnl
= destdn
->pop_projected_linkage();
7543 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7544 assert(!destdn
->is_projected()); // no other projected
7546 destdn
->link_remote(destdnl
, in
);
7547 if (destdn
->is_auth())
7548 destdn
->mark_dirty(mdr
->more()->pvmap
[destdn
], mdr
->ls
);
7551 in
->pop_and_dirty_projected_inode(mdr
->ls
);
7553 dout(10) << "merging remote onto primary link" << dendl
;
7554 oldin
->pop_and_dirty_projected_inode(mdr
->ls
);
7558 dout(10) << "merging primary onto remote link" << dendl
;
7559 destdn
->get_dir()->unlink_inode(destdn
, false);
7561 destdnl
= destdn
->pop_projected_linkage();
7562 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7563 assert(!destdn
->is_projected()); // no other projected
7565 // srcdn inode import?
7566 if (!srcdn
->is_auth() && destdn
->is_auth()) {
7567 assert(mdr
->more()->inode_import
.length() > 0);
7569 map
<client_t
,Capability::Import
> imported_caps
;
7571 // finish cap imports
7572 finish_force_open_sessions(mdr
->more()->imported_session_map
);
7573 if (mdr
->more()->cap_imports
.count(destdnl
->get_inode())) {
7574 mdcache
->migrator
->finish_import_inode_caps(destdnl
->get_inode(),
7575 mdr
->more()->srcdn_auth_mds
, true,
7576 mdr
->more()->imported_session_map
,
7577 mdr
->more()->cap_imports
[destdnl
->get_inode()],
7581 mdr
->more()->inode_import
.clear();
7582 ::encode(imported_caps
, mdr
->more()->inode_import
);
7584 /* hack: add an auth pin for each xlock we hold. These were
7585 * remote xlocks previously but now they're local and
7586 * we're going to try and unpin when we xlock_finish. */
7587 for (set
<SimpleLock
*>::iterator i
= mdr
->xlocks
.begin();
7588 i
!= mdr
->xlocks
.end();
7590 if ((*i
)->get_parent() == destdnl
->get_inode() &&
7591 !(*i
)->is_locallock())
7592 mds
->locker
->xlock_import(*i
);
7594 // hack: fix auth bit
7595 in
->state_set(CInode::STATE_AUTH
);
7597 mdr
->clear_ambiguous_auth();
7600 if (destdn
->is_auth()) {
7601 in
->pop_and_dirty_projected_inode(mdr
->ls
);
7604 // FIXME: fix up snaprealm!
7609 if (srcdn
->is_auth())
7610 srcdn
->mark_dirty(mdr
->more()->pvmap
[srcdn
], mdr
->ls
);
7611 srcdn
->pop_projected_linkage();
7612 if (mdr
->is_slave() && !mdr
->more()->slave_update_journaled
)
7613 assert(!srcdn
->is_projected()); // no other projected
7615 // apply remaining projected inodes (nested)
7618 // update subtree map?
7619 if (destdnl
->is_primary() && in
->is_dir()) {
7620 mdcache
->adjust_subtree_after_rename(in
, srcdn
->get_dir(), true);
7622 if (destdn
->is_auth())
7623 mdcache
->migrator
->adjust_export_after_rename(in
, srcdn
->get_dir());
7626 if (straydn
&& oldin
->is_dir())
7627 mdcache
->adjust_subtree_after_rename(oldin
, destdn
->get_dir(), true);
7629 // removing a new dn?
7630 if (srcdn
->is_auth())
7631 srcdn
->get_dir()->try_remove_unlinked_dn(srcdn
);
7639 class C_MDS_SlaveRenamePrep
: public ServerLogContext
{
7640 CDentry
*srcdn
, *destdn
, *straydn
;
7642 C_MDS_SlaveRenamePrep(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
7643 ServerLogContext(s
, m
), srcdn(sr
), destdn(de
), straydn(st
) {}
7644 void finish(int r
) override
{
7645 server
->_logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
7649 class C_MDS_SlaveRenameCommit
: public ServerContext
{
7651 CDentry
*srcdn
, *destdn
, *straydn
;
7653 C_MDS_SlaveRenameCommit(Server
*s
, MDRequestRef
& m
, CDentry
*sr
, CDentry
*de
, CDentry
*st
) :
7654 ServerContext(s
), mdr(m
), srcdn(sr
), destdn(de
), straydn(st
) {}
7655 void finish(int r
) override
{
7656 server
->_commit_slave_rename(mdr
, r
, srcdn
, destdn
, straydn
);
7660 class C_MDS_SlaveRenameSessionsFlushed
: public ServerContext
{
7663 C_MDS_SlaveRenameSessionsFlushed(Server
*s
, MDRequestRef
& r
) :
7664 ServerContext(s
), mdr(r
) {}
7665 void finish(int r
) override
{
7666 server
->_slave_rename_sessions_flushed(mdr
);
7670 /* This function DOES put the mdr->slave_request before returning*/
7671 void Server::handle_slave_rename_prep(MDRequestRef
& mdr
)
7673 dout(10) << "handle_slave_rename_prep " << *mdr
7674 << " " << mdr
->slave_request
->srcdnpath
7675 << " to " << mdr
->slave_request
->destdnpath
7678 if (mdr
->slave_request
->is_interrupted()) {
7679 dout(10) << " slave request interrupted, sending noop reply" << dendl
;
7680 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
7681 reply
->mark_interrupted();
7682 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7683 mdr
->reset_slave_request();
7688 filepath
destpath(mdr
->slave_request
->destdnpath
);
7689 dout(10) << " dest " << destpath
<< dendl
;
7690 vector
<CDentry
*> trace
;
7691 int r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, destpath
, &trace
, NULL
, MDS_TRAVERSE_DISCOVERXLOCK
);
7694 mdcache
->find_ino_peers(destpath
.get_ino(), new C_MDS_RetryRequest(mdcache
, mdr
),
7698 assert(r
== 0); // we shouldn't get an error here!
7700 CDentry
*destdn
= trace
.back();
7701 CDentry::linkage_t
*destdnl
= destdn
->get_projected_linkage();
7702 dout(10) << " destdn " << *destdn
<< dendl
;
7706 filepath
srcpath(mdr
->slave_request
->srcdnpath
);
7707 dout(10) << " src " << srcpath
<< dendl
;
7708 CInode
*srci
= nullptr;
7709 r
= mdcache
->path_traverse(mdr
, NULL
, NULL
, srcpath
, &trace
, &srci
, MDS_TRAVERSE_DISCOVERXLOCK
);
7713 // srcpath must not point to a null dentry
7714 assert(srci
!= nullptr);
7716 CDentry
*srcdn
= trace
.back();
7717 CDentry::linkage_t
*srcdnl
= srcdn
->get_projected_linkage();
7718 dout(10) << " srcdn " << *srcdn
<< dendl
;
7723 bool linkmerge
= (srcdnl
->get_inode() == destdnl
->get_inode() &&
7724 (srcdnl
->is_primary() || destdnl
->is_primary()));
7725 CDentry
*straydn
= mdr
->straydn
;
7726 if (destdnl
->is_primary() && !linkmerge
)
7729 mdr
->set_op_stamp(mdr
->slave_request
->op_stamp
);
7730 mdr
->more()->srcdn_auth_mds
= srcdn
->authority().first
;
7732 // set up commit waiter (early, to clean up any freezing etc we do)
7733 if (!mdr
->more()->slave_commit
)
7734 mdr
->more()->slave_commit
= new C_MDS_SlaveRenameCommit(this, mdr
, srcdn
, destdn
, straydn
);
7737 if (srcdn
->is_auth()) {
7738 set
<mds_rank_t
> srcdnrep
;
7739 srcdn
->list_replicas(srcdnrep
);
7741 bool reply_witness
= false;
7742 if (srcdnl
->is_primary() && !srcdnl
->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
7745 // - avoid conflicting lock state changes
7746 // - avoid concurrent updates to the inode
7747 // (this could also be accomplished with the versionlock)
7748 int allowance
= 2; // 1 for the mdr auth_pin, 1 for the link lock
7749 allowance
+= srcdnl
->get_inode()->is_dir(); // for the snap lock
7750 dout(10) << " freezing srci " << *srcdnl
->get_inode() << " with allowance " << allowance
<< dendl
;
7751 bool frozen_inode
= srcdnl
->get_inode()->freeze_inode(allowance
);
7753 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7754 if (srcdnl
->get_inode()->is_frozen_auth_pin())
7755 mdr
->unfreeze_auth_pin();
7757 if (!frozen_inode
) {
7758 srcdnl
->get_inode()->add_waiter(CInode::WAIT_FROZEN
, new C_MDS_RetryRequest(mdcache
, mdr
));
7763 * set ambiguous auth for srci
7764 * NOTE: we don't worry about ambiguous cache expire as we do
7765 * with subtree migrations because all slaves will pin
7766 * srcdn->get_inode() for duration of this rename.
7768 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
7770 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7771 // the master will send another OP_RENAMEPREP slave request later.
7772 if (mdr
->slave_request
->witnesses
.size() > 1) {
7773 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl
;
7774 reply_witness
= true;
7777 // make sure bystanders have received all lock related messages
7778 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
7779 if (*p
== mdr
->slave_to_mds
||
7780 (mds
->is_cluster_degraded() &&
7781 !mds
->mdsmap
->is_clientreplay_or_active_or_stopping(*p
)))
7783 MMDSSlaveRequest
*notify
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
7784 MMDSSlaveRequest::OP_RENAMENOTIFY
);
7785 mds
->send_message_mds(notify
, *p
);
7786 mdr
->more()->waiting_on_slave
.insert(*p
);
7789 // make sure clients have received all cap related messages
7790 set
<client_t
> export_client_set
;
7791 mdcache
->migrator
->get_export_client_set(srcdnl
->get_inode(), export_client_set
);
7793 MDSGatherBuilder
gather(g_ceph_context
);
7794 flush_client_sessions(export_client_set
, gather
);
7795 if (gather
.has_subs()) {
7796 mdr
->more()->waiting_on_slave
.insert(MDS_RANK_NONE
);
7797 gather
.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr
));
7802 // is witness list sufficient?
7803 for (set
<mds_rank_t
>::iterator p
= srcdnrep
.begin(); p
!= srcdnrep
.end(); ++p
) {
7804 if (*p
== mdr
->slave_to_mds
||
7805 mdr
->slave_request
->witnesses
.count(*p
)) continue;
7806 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl
;
7807 reply_witness
= true;
7811 if (reply_witness
) {
7812 assert(!srcdnrep
.empty());
7813 MMDSSlaveRequest
*reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
,
7814 MMDSSlaveRequest::OP_RENAMEPREPACK
);
7815 reply
->witnesses
.swap(srcdnrep
);
7816 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7817 mdr
->reset_slave_request();
7820 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl
;
7821 if (!mdr
->more()->waiting_on_slave
.empty()) {
7822 dout(10) << " still waiting for rename notify acks from "
7823 << mdr
->more()->waiting_on_slave
<< dendl
;
7826 } else if (srcdnl
->is_primary() && srcdn
->authority() != destdn
->authority()) {
7827 // set ambiguous auth for srci on witnesses
7828 mdr
->set_ambiguous_auth(srcdnl
->get_inode());
7831 // encode everything we'd need to roll this back... basically, just the original state.
7832 rename_rollback rollback
;
7834 rollback
.reqid
= mdr
->reqid
;
7836 rollback
.orig_src
.dirfrag
= srcdn
->get_dir()->dirfrag();
7837 rollback
.orig_src
.dirfrag_old_mtime
= srcdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
7838 rollback
.orig_src
.dirfrag_old_rctime
= srcdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
7839 rollback
.orig_src
.dname
= std::string(srcdn
->get_name());
7840 if (srcdnl
->is_primary())
7841 rollback
.orig_src
.ino
= srcdnl
->get_inode()->ino();
7843 assert(srcdnl
->is_remote());
7844 rollback
.orig_src
.remote_ino
= srcdnl
->get_remote_ino();
7845 rollback
.orig_src
.remote_d_type
= srcdnl
->get_remote_d_type();
7848 rollback
.orig_dest
.dirfrag
= destdn
->get_dir()->dirfrag();
7849 rollback
.orig_dest
.dirfrag_old_mtime
= destdn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
7850 rollback
.orig_dest
.dirfrag_old_rctime
= destdn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
7851 rollback
.orig_dest
.dname
= std::string(destdn
->get_name());
7852 if (destdnl
->is_primary())
7853 rollback
.orig_dest
.ino
= destdnl
->get_inode()->ino();
7854 else if (destdnl
->is_remote()) {
7855 rollback
.orig_dest
.remote_ino
= destdnl
->get_remote_ino();
7856 rollback
.orig_dest
.remote_d_type
= destdnl
->get_remote_d_type();
7860 rollback
.stray
.dirfrag
= straydn
->get_dir()->dirfrag();
7861 rollback
.stray
.dirfrag_old_mtime
= straydn
->get_dir()->get_projected_fnode()->fragstat
.mtime
;
7862 rollback
.stray
.dirfrag_old_rctime
= straydn
->get_dir()->get_projected_fnode()->rstat
.rctime
;
7863 rollback
.stray
.dname
= std::string(straydn
->get_name());
7865 ::encode(rollback
, mdr
->more()->rollback_bl
);
7866 dout(20) << " rollback is " << mdr
->more()->rollback_bl
.length() << " bytes" << dendl
;
7869 mdr
->ls
= mdlog
->get_current_segment();
7870 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_prep", mdr
->reqid
, mdr
->slave_to_mds
,
7871 ESlaveUpdate::OP_PREPARE
, ESlaveUpdate::RENAME
);
7872 mdlog
->start_entry(le
);
7873 le
->rollback
= mdr
->more()->rollback_bl
;
7875 bufferlist blah
; // inode import data... obviously not used if we're the slave
7876 _rename_prepare(mdr
, &le
->commit
, &blah
, srcdn
, destdn
, straydn
);
7878 if (le
->commit
.empty()) {
7879 dout(10) << " empty metablob, skipping journal" << dendl
;
7880 mdlog
->cancel_entry(le
);
7882 _logged_slave_rename(mdr
, srcdn
, destdn
, straydn
);
7884 mdr
->more()->slave_update_journaled
= true;
7885 submit_mdlog_entry(le
, new C_MDS_SlaveRenamePrep(this, mdr
, srcdn
, destdn
, straydn
),
7891 void Server::_logged_slave_rename(MDRequestRef
& mdr
,
7892 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7894 dout(10) << "_logged_slave_rename " << *mdr
<< dendl
;
7897 MMDSSlaveRequest
*reply
= NULL
;
7898 if (!mdr
->aborted
) {
7899 reply
= new MMDSSlaveRequest(mdr
->reqid
, mdr
->attempt
, MMDSSlaveRequest::OP_RENAMEPREPACK
);
7900 if (!mdr
->more()->slave_update_journaled
)
7901 reply
->mark_not_journaled();
7904 CDentry::linkage_t
*srcdnl
= srcdn
->get_linkage();
7905 CDentry::linkage_t
*destdnl
= NULL
;
7906 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7909 if (srcdn
->is_auth() && srcdnl
->is_primary()) {
7910 // set export bounds for CInode::encode_export()
7912 if (srcdnl
->get_inode()->is_dir()) {
7913 srcdnl
->get_inode()->get_dirfrags(bounds
);
7914 for (list
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
)
7915 (*p
)->state_set(CDir::STATE_EXPORTBOUND
);
7918 map
<client_t
,entity_inst_t
> exported_client_map
;
7920 mdcache
->migrator
->encode_export_inode(srcdnl
->get_inode(), inodebl
,
7921 exported_client_map
);
7923 for (list
<CDir
*>::iterator p
= bounds
.begin(); p
!= bounds
.end(); ++p
)
7924 (*p
)->state_clear(CDir::STATE_EXPORTBOUND
);
7927 ::encode(exported_client_map
, reply
->inode_export
, mds
->mdsmap
->get_up_features());
7928 reply
->inode_export
.claim_append(inodebl
);
7929 reply
->inode_export_v
= srcdnl
->get_inode()->inode
.version
;
7932 // remove mdr auth pin
7933 mdr
->auth_unpin(srcdnl
->get_inode());
7934 mdr
->more()->is_inode_exporter
= true;
7936 if (srcdnl
->get_inode()->is_dirty())
7937 srcdnl
->get_inode()->mark_clean();
7939 dout(10) << " exported srci " << *srcdnl
->get_inode() << dendl
;
7943 _rename_apply(mdr
, srcdn
, destdn
, straydn
);
7945 destdnl
= destdn
->get_linkage();
7948 utime_t now
= ceph_clock_now();
7949 mds
->balancer
->hit_dir(now
, srcdn
->get_dir(), META_POP_IWR
);
7950 if (destdnl
->get_inode() && destdnl
->get_inode()->is_auth())
7951 mds
->balancer
->hit_inode(now
, destdnl
->get_inode(), META_POP_IWR
);
7954 mdr
->reset_slave_request();
7958 mds
->send_message_mds(reply
, mdr
->slave_to_mds
);
7960 assert(mdr
->aborted
);
7961 dout(10) << " abort flag set, finishing" << dendl
;
7962 mdcache
->request_finish(mdr
);
7966 void Server::_commit_slave_rename(MDRequestRef
& mdr
, int r
,
7967 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
)
7969 dout(10) << "_commit_slave_rename " << *mdr
<< " r=" << r
<< dendl
;
7971 CDentry::linkage_t
*destdnl
= destdn
->get_linkage();
7973 list
<MDSInternalContextBase
*> finished
;
7975 // unfreeze+singleauth inode
7976 // hmm, do i really need to delay this?
7977 if (mdr
->more()->is_inode_exporter
) {
7979 CInode
*in
= destdnl
->get_inode();
7982 // we exported, clear out any xlocks that we moved to another MDS
7983 set
<SimpleLock
*>::iterator i
= mdr
->xlocks
.begin();
7984 while (i
!= mdr
->xlocks
.end()) {
7985 SimpleLock
*lock
= *i
++;
7987 // we only care about xlocks on the exported inode
7988 if (lock
->get_parent() == in
&&
7989 !lock
->is_locallock())
7990 mds
->locker
->xlock_export(lock
, mdr
.get());
7993 map
<client_t
,Capability::Import
> peer_imported
;
7994 bufferlist::iterator bp
= mdr
->more()->inode_import
.begin();
7995 ::decode(peer_imported
, bp
);
7997 dout(10) << " finishing inode export on " << *destdnl
->get_inode() << dendl
;
7998 mdcache
->migrator
->finish_export_inode(destdnl
->get_inode(), ceph_clock_now(),
7999 mdr
->slave_to_mds
, peer_imported
, finished
);
8000 mds
->queue_waiters(finished
); // this includes SINGLEAUTH waiters.
8003 assert(destdnl
->get_inode()->is_frozen_inode());
8004 destdnl
->get_inode()->unfreeze_inode(finished
);
8008 if (mdr
->more()->is_ambiguous_auth
) {
8009 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
8010 mdr
->more()->is_ambiguous_auth
= false;
8013 if (straydn
&& mdr
->more()->slave_update_journaled
) {
8014 CInode
*strayin
= straydn
->get_projected_linkage()->get_inode();
8015 if (strayin
&& !strayin
->snaprealm
)
8016 mdcache
->clear_dirty_bits_for_stray(strayin
);
8019 mds
->queue_waiters(finished
);
8022 if (mdr
->more()->slave_update_journaled
) {
8023 // write a commit to the journal
8024 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_commit", mdr
->reqid
,
8025 mdr
->slave_to_mds
, ESlaveUpdate::OP_COMMIT
,
8026 ESlaveUpdate::RENAME
);
8027 mdlog
->start_entry(le
);
8028 submit_mdlog_entry(le
, new C_MDS_CommittedSlave(this, mdr
), mdr
, __func__
);
8031 _committed_slave(mdr
);
8036 // rollback_bl may be empty if we froze the inode but had to provide an expanded
8037 // witness list from the master, and they failed before we tried prep again.
8038 if (mdr
->more()->rollback_bl
.length()) {
8039 if (mdr
->more()->is_inode_exporter
) {
8040 dout(10) << " reversing inode export of " << *destdnl
->get_inode() << dendl
;
8041 destdnl
->get_inode()->abort_export();
8043 if (mdcache
->is_ambiguous_slave_update(mdr
->reqid
, mdr
->slave_to_mds
)) {
8044 mdcache
->remove_ambiguous_slave_update(mdr
->reqid
, mdr
->slave_to_mds
);
8045 // rollback but preserve the slave request
8046 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
, false);
8047 mdr
->more()->rollback_bl
.clear();
8049 do_rename_rollback(mdr
->more()->rollback_bl
, mdr
->slave_to_mds
, mdr
, true);
8051 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl
;
8053 if (mdr
->more()->is_ambiguous_auth
) {
8054 if (srcdn
->is_auth())
8055 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
8057 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
8058 mdr
->more()->is_ambiguous_auth
= false;
8060 mds
->queue_waiters(finished
);
8061 mdcache
->request_finish(mdr
);
8066 void _rollback_repair_dir(MutationRef
& mut
, CDir
*dir
, rename_rollback::drec
&r
, utime_t ctime
,
8067 bool isdir
, int linkunlink
, nest_info_t
&rstat
)
8070 pf
= dir
->project_fnode();
8071 mut
->add_projected_fnode(dir
);
8072 pf
->version
= dir
->pre_dirty();
8075 pf
->fragstat
.nsubdirs
+= linkunlink
;
8077 pf
->fragstat
.nfiles
+= linkunlink
;
8080 pf
->rstat
.rbytes
+= linkunlink
* rstat
.rbytes
;
8081 pf
->rstat
.rfiles
+= linkunlink
* rstat
.rfiles
;
8082 pf
->rstat
.rsubdirs
+= linkunlink
* rstat
.rsubdirs
;
8083 pf
->rstat
.rsnaprealms
+= linkunlink
* rstat
.rsnaprealms
;
8085 if (pf
->fragstat
.mtime
== ctime
) {
8086 pf
->fragstat
.mtime
= r
.dirfrag_old_mtime
;
8087 if (pf
->rstat
.rctime
== ctime
)
8088 pf
->rstat
.rctime
= r
.dirfrag_old_rctime
;
8090 mut
->add_updated_lock(&dir
->get_inode()->filelock
);
8091 mut
->add_updated_lock(&dir
->get_inode()->nestlock
);
8094 struct C_MDS_LoggedRenameRollback
: public ServerLogContext
{
8101 C_MDS_LoggedRenameRollback(Server
*s
, MutationRef
& m
, MDRequestRef
& r
,
8102 CDentry
*sd
, version_t pv
, CDentry
*dd
,
8103 CDentry
*st
, bool f
) :
8104 ServerLogContext(s
, r
), mut(m
), srcdn(sd
), srcdnpv(pv
), destdn(dd
),
8105 straydn(st
), finish_mdr(f
) {}
8106 void finish(int r
) override
{
8107 server
->_rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
,
8108 destdn
, straydn
, finish_mdr
);
8112 void Server::do_rename_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
,
8115 rename_rollback rollback
;
8116 bufferlist::iterator p
= rbl
.begin();
8117 ::decode(rollback
, p
);
8119 dout(10) << "do_rename_rollback on " << rollback
.reqid
<< dendl
;
8120 // need to finish this update before sending resolve to claim the subtree
8121 mdcache
->add_rollback(rollback
.reqid
, master
);
8123 MutationRef
mut(new MutationImpl(nullptr, utime_t(), rollback
.reqid
));
8124 mut
->ls
= mds
->mdlog
->get_current_segment();
8126 CDentry
*srcdn
= NULL
;
8127 CDir
*srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
);
8129 srcdir
= mdcache
->get_dirfrag(rollback
.orig_src
.dirfrag
.ino
, rollback
.orig_src
.dname
);
8131 dout(10) << " srcdir " << *srcdir
<< dendl
;
8132 srcdn
= srcdir
->lookup(rollback
.orig_src
.dname
);
8134 dout(10) << " srcdn " << *srcdn
<< dendl
;
8135 assert(srcdn
->get_linkage()->is_null());
8137 dout(10) << " srcdn not found" << dendl
;
8139 dout(10) << " srcdir not found" << dendl
;
8141 CDentry
*destdn
= NULL
;
8142 CDir
*destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
);
8144 destdir
= mdcache
->get_dirfrag(rollback
.orig_dest
.dirfrag
.ino
, rollback
.orig_dest
.dname
);
8146 dout(10) << " destdir " << *destdir
<< dendl
;
8147 destdn
= destdir
->lookup(rollback
.orig_dest
.dname
);
8149 dout(10) << " destdn " << *destdn
<< dendl
;
8151 dout(10) << " destdn not found" << dendl
;
8153 dout(10) << " destdir not found" << dendl
;
8156 if (rollback
.orig_src
.ino
) {
8157 in
= mdcache
->get_inode(rollback
.orig_src
.ino
);
8158 if (in
&& in
->is_dir())
8159 assert(srcdn
&& destdn
);
8161 in
= mdcache
->get_inode(rollback
.orig_src
.remote_ino
);
8163 CDir
*straydir
= NULL
;
8164 CDentry
*straydn
= NULL
;
8165 if (rollback
.stray
.dirfrag
.ino
) {
8166 straydir
= mdcache
->get_dirfrag(rollback
.stray
.dirfrag
);
8168 dout(10) << "straydir " << *straydir
<< dendl
;
8169 straydn
= straydir
->lookup(rollback
.stray
.dname
);
8171 dout(10) << " straydn " << *straydn
<< dendl
;
8172 assert(straydn
->get_linkage()->is_primary());
8174 dout(10) << " straydn not found" << dendl
;
8176 dout(10) << "straydir not found" << dendl
;
8179 CInode
*target
= NULL
;
8180 if (rollback
.orig_dest
.ino
) {
8181 target
= mdcache
->get_inode(rollback
.orig_dest
.ino
);
8183 assert(destdn
&& straydn
);
8184 } else if (rollback
.orig_dest
.remote_ino
)
8185 target
= mdcache
->get_inode(rollback
.orig_dest
.remote_ino
);
8187 // can't use is_auth() in the resolve stage
8188 mds_rank_t whoami
= mds
->get_nodeid();
8190 assert(!destdn
|| destdn
->authority().first
!= whoami
);
8191 assert(!straydn
|| straydn
->authority().first
!= whoami
);
8193 bool force_journal_src
= false;
8194 bool force_journal_dest
= false;
8195 if (in
&& in
->is_dir() && srcdn
->authority().first
!= whoami
)
8196 force_journal_src
= _need_force_journal(in
, false);
8197 if (in
&& target
&& target
->is_dir())
8198 force_journal_dest
= _need_force_journal(in
, true);
8200 version_t srcdnpv
= 0;
8203 if (srcdn
->authority().first
== whoami
)
8204 srcdnpv
= srcdn
->pre_dirty();
8205 if (rollback
.orig_src
.ino
) {
8207 srcdn
->push_projected_linkage(in
);
8209 srcdn
->push_projected_linkage(rollback
.orig_src
.remote_ino
,
8210 rollback
.orig_src
.remote_d_type
);
8213 CInode::mempool_inode
*pip
= 0;
8215 if (in
->authority().first
== whoami
) {
8216 auto &pi
= in
->project_inode();
8217 mut
->add_projected_inode(in
);
8218 pi
.inode
.version
= in
->pre_dirty();
8221 pip
= in
->get_projected_inode();
8222 if (pip
->ctime
== rollback
.ctime
)
8223 pip
->ctime
= rollback
.orig_src
.old_ctime
;
8226 if (srcdn
&& srcdn
->authority().first
== whoami
) {
8228 _rollback_repair_dir(mut
, srcdir
, rollback
.orig_src
, rollback
.ctime
,
8229 in
? in
->is_dir() : false, 1, pip
? pip
->accounted_rstat
: blah
);
8234 if (rollback
.orig_dest
.ino
&& target
) {
8235 destdn
->push_projected_linkage(target
);
8236 } else if (rollback
.orig_dest
.remote_ino
) {
8237 destdn
->push_projected_linkage(rollback
.orig_dest
.remote_ino
,
8238 rollback
.orig_dest
.remote_d_type
);
8240 // the dentry will be trimmed soon, it's ok to have wrong linkage
8241 if (rollback
.orig_dest
.ino
)
8242 assert(mds
->is_resolve());
8243 destdn
->push_projected_linkage();
8248 straydn
->push_projected_linkage();
8251 CInode::mempool_inode
*ti
= NULL
;
8252 if (target
->authority().first
== whoami
) {
8253 auto &pi
= target
->project_inode();
8254 mut
->add_projected_inode(target
);
8255 pi
.inode
.version
= target
->pre_dirty();
8258 ti
= target
->get_projected_inode();
8259 if (ti
->ctime
== rollback
.ctime
)
8260 ti
->ctime
= rollback
.orig_dest
.old_ctime
;
8261 if (MDS_INO_IS_STRAY(rollback
.orig_src
.dirfrag
.ino
)) {
8262 if (MDS_INO_IS_STRAY(rollback
.orig_dest
.dirfrag
.ino
))
8263 assert(!rollback
.orig_dest
.ino
&& !rollback
.orig_dest
.remote_ino
);
8265 assert(rollback
.orig_dest
.remote_ino
&&
8266 rollback
.orig_dest
.remote_ino
== rollback
.orig_src
.ino
);
8272 dout(0) << " srcdn back to " << *srcdn
<< dendl
;
8274 dout(0) << " srci back to " << *in
<< dendl
;
8276 dout(0) << " destdn back to " << *destdn
<< dendl
;
8278 dout(0) << " desti back to " << *target
<< dendl
;
8281 ESlaveUpdate
*le
= new ESlaveUpdate(mdlog
, "slave_rename_rollback", rollback
.reqid
, master
,
8282 ESlaveUpdate::OP_ROLLBACK
, ESlaveUpdate::RENAME
);
8283 mdlog
->start_entry(le
);
8285 if (srcdn
&& (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
8286 le
->commit
.add_dir_context(srcdir
);
8287 if (rollback
.orig_src
.ino
)
8288 le
->commit
.add_primary_dentry(srcdn
, 0, true);
8290 le
->commit
.add_remote_dentry(srcdn
, true);
8293 if (!rollback
.orig_src
.ino
&& // remote linkage
8294 in
&& in
->authority().first
== whoami
) {
8295 le
->commit
.add_dir_context(in
->get_projected_parent_dir());
8296 le
->commit
.add_primary_dentry(in
->get_projected_parent_dn(), in
, true);
8299 if (force_journal_dest
) {
8300 assert(rollback
.orig_dest
.ino
);
8301 le
->commit
.add_dir_context(destdir
);
8302 le
->commit
.add_primary_dentry(destdn
, 0, true);
8305 // slave: no need to journal straydn
8307 if (target
&& target
!= in
&& target
->authority().first
== whoami
) {
8308 assert(rollback
.orig_dest
.remote_ino
);
8309 le
->commit
.add_dir_context(target
->get_projected_parent_dir());
8310 le
->commit
.add_primary_dentry(target
->get_projected_parent_dn(), target
, true);
8313 if (in
&& in
->is_dir() && (srcdn
->authority().first
== whoami
|| force_journal_src
)) {
8314 dout(10) << " noting renamed dir ino " << in
->ino() << " in metablob" << dendl
;
8315 le
->commit
.renamed_dirino
= in
->ino();
8316 if (srcdn
->authority().first
== whoami
) {
8318 in
->get_dirfrags(ls
);
8319 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
8321 if (!dir
->is_auth())
8322 le
->commit
.renamed_dir_frags
.push_back(dir
->get_frag());
8324 dout(10) << " noting renamed dir open frags " << le
->commit
.renamed_dir_frags
<< dendl
;
8326 } else if (force_journal_dest
) {
8327 dout(10) << " noting rename target ino " << target
->ino() << " in metablob" << dendl
;
8328 le
->commit
.renamed_dirino
= target
->ino();
8331 if (target
&& target
->is_dir()) {
8333 mdcache
->project_subtree_rename(target
, straydir
, destdir
);
8336 if (in
&& in
->is_dir()) {
8338 mdcache
->project_subtree_rename(in
, destdir
, srcdir
);
8341 if (mdr
&& !mdr
->more()->slave_update_journaled
) {
8342 assert(le
->commit
.empty());
8343 mdlog
->cancel_entry(le
);
8345 _rename_rollback_finish(mut
, mdr
, srcdn
, srcdnpv
, destdn
, straydn
, finish_mdr
);
8347 assert(!le
->commit
.empty());
8349 mdr
->more()->slave_update_journaled
= false;
8350 MDSLogContextBase
*fin
= new C_MDS_LoggedRenameRollback(this, mut
, mdr
, srcdn
, srcdnpv
,
8351 destdn
, straydn
, finish_mdr
);
8352 submit_mdlog_entry(le
, fin
, mdr
, __func__
);
8357 void Server::_rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
,
8358 version_t srcdnpv
, CDentry
*destdn
,
8359 CDentry
*straydn
, bool finish_mdr
)
8361 dout(10) << "_rename_rollback_finish " << mut
->reqid
<< dendl
;
8364 straydn
->get_dir()->unlink_inode(straydn
);
8365 straydn
->pop_projected_linkage();
8368 destdn
->get_dir()->unlink_inode(destdn
);
8369 destdn
->pop_projected_linkage();
8372 srcdn
->pop_projected_linkage();
8373 if (srcdn
->authority().first
== mds
->get_nodeid())
8374 srcdn
->mark_dirty(srcdnpv
, mut
->ls
);
8379 if (srcdn
&& srcdn
->get_linkage()->is_primary()) {
8380 CInode
*in
= srcdn
->get_linkage()->get_inode();
8381 if (srcdn
->authority().first
== mds
->get_nodeid())
8382 in
->state_set(CInode::STATE_AUTH
);
8383 // update subtree map?
8384 if (in
&& in
->is_dir()) {
8386 mdcache
->adjust_subtree_after_rename(in
, destdn
->get_dir(), true);
8391 CInode
*oldin
= destdn
->get_linkage()->get_inode();
8392 // update subtree map?
8393 if (oldin
&& oldin
->is_dir()) {
8395 mdcache
->adjust_subtree_after_rename(oldin
, straydn
->get_dir(), true);
8399 if (mds
->is_resolve()) {
8402 root
= mdcache
->get_subtree_root(straydn
->get_dir());
8404 root
= mdcache
->get_subtree_root(destdn
->get_dir());
8406 mdcache
->try_trim_non_auth_subtree(root
);
8410 list
<MDSInternalContextBase
*> finished
;
8411 if (mdr
->more()->is_ambiguous_auth
) {
8412 if (srcdn
->is_auth())
8413 mdr
->more()->rename_inode
->unfreeze_inode(finished
);
8415 mdr
->more()->rename_inode
->clear_ambiguous_auth(finished
);
8416 mdr
->more()->is_ambiguous_auth
= false;
8418 mds
->queue_waiters(finished
);
8419 if (finish_mdr
|| mdr
->aborted
)
8420 mdcache
->request_finish(mdr
);
8422 mdr
->more()->slave_rolling_back
= false;
8425 mdcache
->finish_rollback(mut
->reqid
);
8430 /* This function DOES put the passed message before returning*/
8431 void Server::handle_slave_rename_prep_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8433 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8434 << " witnessed by " << ack
->get_source()
8435 << " " << *ack
<< dendl
;
8436 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8439 mdr
->more()->slaves
.insert(from
);
8440 if (mdr
->more()->srcdn_auth_mds
== from
&&
8441 mdr
->more()->is_remote_frozen_authpin
&&
8442 !mdr
->more()->is_ambiguous_auth
) {
8443 mdr
->set_ambiguous_auth(mdr
->more()->rename_inode
);
8446 // witnessed? or add extra witnesses?
8447 assert(mdr
->more()->witnessed
.count(from
) == 0);
8448 if (ack
->is_interrupted()) {
8449 dout(10) << " slave request interrupted, noop" << dendl
;
8450 } else if (ack
->witnesses
.empty()) {
8451 mdr
->more()->witnessed
.insert(from
);
8452 if (!ack
->is_not_journaled())
8453 mdr
->more()->has_journaled_slaves
= true;
8455 dout(10) << " extra witnesses (srcdn replicas) are " << ack
->witnesses
<< dendl
;
8456 mdr
->more()->extra_witnesses
.swap(ack
->witnesses
);
8457 mdr
->more()->extra_witnesses
.erase(mds
->get_nodeid()); // not me!
8461 if (ack
->inode_export
.length()) {
8462 dout(10) << " got srci import" << dendl
;
8463 mdr
->more()->inode_import
.claim(ack
->inode_export
);
8464 mdr
->more()->inode_import_v
= ack
->inode_export_v
;
8467 // remove from waiting list
8468 assert(mdr
->more()->waiting_on_slave
.count(from
));
8469 mdr
->more()->waiting_on_slave
.erase(from
);
8471 if (mdr
->more()->waiting_on_slave
.empty())
8472 dispatch_client_request(mdr
); // go again!
8474 dout(10) << "still waiting on slaves " << mdr
->more()->waiting_on_slave
<< dendl
;
8477 void Server::handle_slave_rename_notify_ack(MDRequestRef
& mdr
, MMDSSlaveRequest
*ack
)
8479 dout(10) << "handle_slave_rename_notify_ack " << *mdr
<< " from mds."
8480 << ack
->get_source() << dendl
;
8481 assert(mdr
->is_slave());
8482 mds_rank_t from
= mds_rank_t(ack
->get_source().num());
8484 if (mdr
->more()->waiting_on_slave
.count(from
)) {
8485 mdr
->more()->waiting_on_slave
.erase(from
);
8487 if (mdr
->more()->waiting_on_slave
.empty()) {
8488 if (mdr
->slave_request
)
8489 dispatch_slave_request(mdr
);
8491 dout(10) << " still waiting for rename notify acks from "
8492 << mdr
->more()->waiting_on_slave
<< dendl
;
8496 void Server::_slave_rename_sessions_flushed(MDRequestRef
& mdr
)
8498 dout(10) << "_slave_rename_sessions_flushed " << *mdr
<< dendl
;
8500 if (mdr
->more()->waiting_on_slave
.count(MDS_RANK_NONE
)) {
8501 mdr
->more()->waiting_on_slave
.erase(MDS_RANK_NONE
);
8503 if (mdr
->more()->waiting_on_slave
.empty()) {
8504 if (mdr
->slave_request
)
8505 dispatch_slave_request(mdr
);
8507 dout(10) << " still waiting for rename notify acks from "
8508 << mdr
->more()->waiting_on_slave
<< dendl
;
8513 /* This function takes responsibility for the passed mdr*/
8514 void Server::handle_client_lssnap(MDRequestRef
& mdr
)
8516 MClientRequest
*req
= mdr
->client_request
;
8519 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8520 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8521 respond_to_request(mdr
, -ESTALE
);
8524 if (!diri
->is_auth()) {
8525 mdcache
->request_forward(mdr
, diri
->authority().first
);
8528 if (!diri
->is_dir()) {
8529 respond_to_request(mdr
, -ENOTDIR
);
8532 dout(10) << "lssnap on " << *diri
<< dendl
;
8535 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8536 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8537 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8540 if (!check_access(mdr
, diri
, MAY_READ
))
8543 SnapRealm
*realm
= diri
->find_snaprealm();
8544 map
<snapid_t
,SnapInfo
*> infomap
;
8545 realm
->get_snap_info(infomap
, diri
->get_oldest_snap());
8547 unsigned max_entries
= req
->head
.args
.readdir
.max_entries
;
8549 max_entries
= infomap
.size();
8550 int max_bytes
= req
->head
.args
.readdir
.max_bytes
;
8552 // make sure at least one item can be encoded
8553 max_bytes
= (512 << 10) + g_conf
->mds_max_xattr_pairs_size
;
8555 __u64 last_snapid
= 0;
8556 string offset_str
= req
->get_path2();
8557 if (!offset_str
.empty())
8558 last_snapid
= realm
->resolve_snapname(offset_str
, diri
->ino());
8561 encode_empty_dirstat(dirbl
);
8563 max_bytes
-= dirbl
.length() - sizeof(__u32
) + sizeof(__u8
) * 2;
8567 map
<snapid_t
,SnapInfo
*>::iterator p
= infomap
.upper_bound(last_snapid
);
8568 for (; p
!= infomap
.end() && num
< max_entries
; ++p
) {
8569 dout(10) << p
->first
<< " -> " << *p
->second
<< dendl
;
8573 if (p
->second
->ino
== diri
->ino())
8574 snap_name
= std::string(p
->second
->name
);
8576 snap_name
= std::string(p
->second
->get_long_name());
8578 unsigned start_len
= dnbl
.length();
8579 if (int(start_len
+ snap_name
.length() + sizeof(__u32
) + sizeof(LeaseStat
)) > max_bytes
)
8582 ::encode(snap_name
, dnbl
);
8583 encode_infinite_lease(dnbl
);
8585 int r
= diri
->encode_inodestat(dnbl
, mdr
->session
, realm
, p
->first
, max_bytes
- (int)dnbl
.length());
8588 keep
.substr_of(dnbl
, 0, start_len
);
8595 ::encode(num
, dirbl
);
8597 if (p
== infomap
.end()) {
8598 flags
= CEPH_READDIR_FRAG_END
;
8599 if (last_snapid
== 0)
8600 flags
|= CEPH_READDIR_FRAG_COMPLETE
;
8602 ::encode(flags
, dirbl
);
8603 dirbl
.claim_append(dnbl
);
8605 mdr
->reply_extra_bl
= dirbl
;
8607 respond_to_request(mdr
, 0);
8613 struct C_MDS_mksnap_finish
: public ServerLogContext
{
8616 C_MDS_mksnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, SnapInfo
&i
) :
8617 ServerLogContext(s
, r
), diri(di
), info(i
) {}
8618 void finish(int r
) override
{
8619 server
->_mksnap_finish(mdr
, diri
, info
);
8623 /* This function takes responsibility for the passed mdr*/
8624 void Server::handle_client_mksnap(MDRequestRef
& mdr
)
8626 if (!mds
->mdsmap
->allows_snaps()) {
8627 // you can't make snapshots until you set an option right now
8628 respond_to_request(mdr
, -EPERM
);
8632 MClientRequest
*req
= mdr
->client_request
;
8633 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8634 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8635 respond_to_request(mdr
, -ESTALE
);
8639 if (!diri
->is_auth()) { // fw to auth?
8640 mdcache
->request_forward(mdr
, diri
->authority().first
);
8645 if (!diri
->is_dir()) {
8646 respond_to_request(mdr
, -ENOTDIR
);
8649 if (diri
->is_system() && !diri
->is_root()) {
8650 // no snaps in system dirs (root is ok)
8651 respond_to_request(mdr
, -EPERM
);
8655 boost::string_view snapname
= req
->get_filepath().last_dentry();
8657 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8658 dout(20) << "mksnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
8659 respond_to_request(mdr
, -EPERM
);
8663 dout(10) << "mksnap " << snapname
<< " on " << *diri
<< dendl
;
8666 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8668 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8669 rdlocks
.erase(&diri
->snaplock
);
8670 xlocks
.insert(&diri
->snaplock
);
8672 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8675 if (!check_access(mdr
, diri
, MAY_WRITE
))
8678 // make sure name is unique
8679 if (diri
->snaprealm
&&
8680 diri
->snaprealm
->exists(snapname
)) {
8681 respond_to_request(mdr
, -EEXIST
);
8684 if (snapname
.length() == 0 ||
8685 snapname
[0] == '_') {
8686 respond_to_request(mdr
, -EINVAL
);
8690 // allocate a snapid
8691 if (!mdr
->more()->stid
) {
8693 mds
->snapclient
->prepare_create(diri
->ino(), snapname
,
8694 mdr
->get_mds_stamp(),
8695 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8696 new C_MDS_RetryRequest(mdcache
, mdr
));
8700 version_t stid
= mdr
->more()->stid
;
8702 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8703 ::decode(snapid
, p
);
8704 dout(10) << " stid " << stid
<< " snapid " << snapid
<< dendl
;
8708 info
.ino
= diri
->ino();
8709 info
.snapid
= snapid
;
8710 info
.name
= std::string(snapname
);
8711 info
.stamp
= mdr
->get_op_stamp();
8713 auto &pi
= diri
->project_inode(false, true);
8714 pi
.inode
.ctime
= info
.stamp
;
8715 if (info
.stamp
> pi
.inode
.rstat
.rctime
)
8716 pi
.inode
.rstat
.rctime
= info
.stamp
;
8717 pi
.inode
.version
= diri
->pre_dirty();
8719 // project the snaprealm
8720 auto &newsnap
= *pi
.snapnode
;
8721 newsnap
.created
= snapid
;
8722 auto em
= newsnap
.snaps
.emplace(std::piecewise_construct
, std::forward_as_tuple(snapid
), std::forward_as_tuple(info
));
8724 em
.first
->second
= info
;
8725 newsnap
.seq
= snapid
;
8726 newsnap
.last_created
= snapid
;
8728 // journal the inode changes
8729 mdr
->ls
= mdlog
->get_current_segment();
8730 EUpdate
*le
= new EUpdate(mdlog
, "mksnap");
8731 mdlog
->start_entry(le
);
8733 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8734 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8735 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8736 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8738 // journal the snaprealm changes
8739 submit_mdlog_entry(le
, new C_MDS_mksnap_finish(this, mdr
, diri
, info
),
8744 void Server::_mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
)
8746 dout(10) << "_mksnap_finish " << *mdr
<< " " << info
<< dendl
;
8748 int op
= (diri
->snaprealm
? CEPH_SNAP_OP_CREATE
: CEPH_SNAP_OP_SPLIT
);
8750 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8753 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
8756 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8758 mdcache
->do_realm_invalidate_and_update_notify(diri
, op
);
8762 mdr
->snapid
= info
.snapid
;
8764 respond_to_request(mdr
, 0);
8770 struct C_MDS_rmsnap_finish
: public ServerLogContext
{
8773 C_MDS_rmsnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
8774 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
8775 void finish(int r
) override
{
8776 server
->_rmsnap_finish(mdr
, diri
, snapid
);
8780 /* This function takes responsibility for the passed mdr*/
8781 void Server::handle_client_rmsnap(MDRequestRef
& mdr
)
8783 MClientRequest
*req
= mdr
->client_request
;
8785 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8786 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8787 respond_to_request(mdr
, -ESTALE
);
8790 if (!diri
->is_auth()) { // fw to auth?
8791 mdcache
->request_forward(mdr
, diri
->authority().first
);
8794 if (!diri
->is_dir()) {
8795 respond_to_request(mdr
, -ENOTDIR
);
8799 boost::string_view snapname
= req
->get_filepath().last_dentry();
8801 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
|| mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8802 dout(20) << "rmsnap " << snapname
<< " on " << *diri
<< " denied to uid " << mdr
->client_request
->get_caller_uid() << dendl
;
8803 respond_to_request(mdr
, -EPERM
);
8807 dout(10) << "rmsnap " << snapname
<< " on " << *diri
<< dendl
;
8810 if (snapname
.length() == 0 || snapname
[0] == '_') {
8811 respond_to_request(mdr
, -EINVAL
); // can't prune a parent snap, currently.
8814 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(snapname
)) {
8815 respond_to_request(mdr
, -ENOENT
);
8818 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(snapname
, diri
->ino());
8819 dout(10) << " snapname " << snapname
<< " is " << snapid
<< dendl
;
8821 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8822 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8823 rdlocks
.erase(&diri
->snaplock
);
8824 xlocks
.insert(&diri
->snaplock
);
8826 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8829 if (!check_access(mdr
, diri
, MAY_WRITE
))
8833 if (!mdr
->more()->stid
) {
8834 mds
->snapclient
->prepare_destroy(diri
->ino(), snapid
,
8835 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8836 new C_MDS_RetryRequest(mdcache
, mdr
));
8839 version_t stid
= mdr
->more()->stid
;
8840 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8843 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
8846 auto &pi
= diri
->project_inode(false, true);
8847 pi
.inode
.version
= diri
->pre_dirty();
8848 pi
.inode
.ctime
= mdr
->get_op_stamp();
8849 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
8850 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
8852 mdr
->ls
= mdlog
->get_current_segment();
8853 EUpdate
*le
= new EUpdate(mdlog
, "rmsnap");
8854 mdlog
->start_entry(le
);
8856 // project the snaprealm
8857 auto &newnode
= *pi
.snapnode
;
8858 newnode
.snaps
.erase(snapid
);
8860 newnode
.last_destroyed
= seq
;
8862 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
8863 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
8864 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
8865 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
8867 submit_mdlog_entry(le
, new C_MDS_rmsnap_finish(this, mdr
, diri
, snapid
),
8872 void Server::_rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
8874 dout(10) << "_rmsnap_finish " << *mdr
<< " " << snapid
<< dendl
;
8875 snapid_t stid
= mdr
->more()->stid
;
8876 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8880 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
8883 mds
->snapclient
->commit(stid
, mdr
->ls
);
8885 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
8887 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_DESTROY
);
8891 respond_to_request(mdr
, 0);
8893 // purge snapshot data
8894 if (diri
->snaprealm
->have_past_parents_open())
8895 diri
->purge_stale_snap_data(diri
->snaprealm
->get_snaps());
8898 struct C_MDS_renamesnap_finish
: public ServerLogContext
{
8901 C_MDS_renamesnap_finish(Server
*s
, MDRequestRef
& r
, CInode
*di
, snapid_t sn
) :
8902 ServerLogContext(s
, r
), diri(di
), snapid(sn
) {}
8903 void finish(int r
) override
{
8904 server
->_renamesnap_finish(mdr
, diri
, snapid
);
8908 /* This function takes responsibility for the passed mdr*/
8909 void Server::handle_client_renamesnap(MDRequestRef
& mdr
)
8911 MClientRequest
*req
= mdr
->client_request
;
8912 if (req
->get_filepath().get_ino() != req
->get_filepath2().get_ino()) {
8913 respond_to_request(mdr
, -EINVAL
);
8917 CInode
*diri
= mdcache
->get_inode(req
->get_filepath().get_ino());
8918 if (!diri
|| diri
->state_test(CInode::STATE_PURGING
)) {
8919 respond_to_request(mdr
, -ESTALE
);
8923 if (!diri
->is_auth()) { // fw to auth?
8924 mdcache
->request_forward(mdr
, diri
->authority().first
);
8928 if (!diri
->is_dir()) { // dir only
8929 respond_to_request(mdr
, -ENOTDIR
);
8933 if (mdr
->client_request
->get_caller_uid() < g_conf
->mds_snap_min_uid
||
8934 mdr
->client_request
->get_caller_uid() > g_conf
->mds_snap_max_uid
) {
8935 respond_to_request(mdr
, -EPERM
);
8939 boost::string_view dstname
= req
->get_filepath().last_dentry();
8940 boost::string_view srcname
= req
->get_filepath2().last_dentry();
8941 dout(10) << "renamesnap " << srcname
<< "->" << dstname
<< " on " << *diri
<< dendl
;
8943 if (srcname
.length() == 0 || srcname
[0] == '_') {
8944 respond_to_request(mdr
, -EINVAL
); // can't rename a parent snap.
8947 if (!diri
->snaprealm
|| !diri
->snaprealm
->exists(srcname
)) {
8948 respond_to_request(mdr
, -ENOENT
);
8951 if (dstname
.length() == 0 || dstname
[0] == '_') {
8952 respond_to_request(mdr
, -EINVAL
);
8955 if (diri
->snaprealm
->exists(dstname
)) {
8956 respond_to_request(mdr
, -EEXIST
);
8960 snapid_t snapid
= diri
->snaprealm
->resolve_snapname(srcname
, diri
->ino());
8961 dout(10) << " snapname " << srcname
<< " is " << snapid
<< dendl
;
8964 set
<SimpleLock
*> rdlocks
, wrlocks
, xlocks
;
8966 mds
->locker
->include_snap_rdlocks(rdlocks
, diri
);
8967 rdlocks
.erase(&diri
->snaplock
);
8968 xlocks
.insert(&diri
->snaplock
);
8970 if (!mds
->locker
->acquire_locks(mdr
, rdlocks
, wrlocks
, xlocks
))
8973 if (!check_access(mdr
, diri
, MAY_WRITE
))
8977 if (!mdr
->more()->stid
) {
8978 mds
->snapclient
->prepare_update(diri
->ino(), snapid
, dstname
, utime_t(),
8979 &mdr
->more()->stid
, &mdr
->more()->snapidbl
,
8980 new C_MDS_RetryRequest(mdcache
, mdr
));
8984 version_t stid
= mdr
->more()->stid
;
8985 bufferlist::iterator p
= mdr
->more()->snapidbl
.begin();
8988 dout(10) << " stid is " << stid
<< ", seq is " << seq
<< dendl
;
8991 auto &pi
= diri
->project_inode(false, true);
8992 pi
.inode
.ctime
= mdr
->get_op_stamp();
8993 if (mdr
->get_op_stamp() > pi
.inode
.rstat
.rctime
)
8994 pi
.inode
.rstat
.rctime
= mdr
->get_op_stamp();
8995 pi
.inode
.version
= diri
->pre_dirty();
8997 // project the snaprealm
8998 auto &newsnap
= *pi
.snapnode
;
8999 auto it
= newsnap
.snaps
.find(snapid
);
9000 assert(it
!= newsnap
.snaps
.end());
9001 it
->second
.name
= std::string(dstname
);
9003 // journal the inode changes
9004 mdr
->ls
= mdlog
->get_current_segment();
9005 EUpdate
*le
= new EUpdate(mdlog
, "renamesnap");
9006 mdlog
->start_entry(le
);
9008 le
->metablob
.add_client_req(req
->get_reqid(), req
->get_oldest_client_tid());
9009 le
->metablob
.add_table_transaction(TABLE_SNAP
, stid
);
9010 mdcache
->predirty_journal_parents(mdr
, &le
->metablob
, diri
, 0, PREDIRTY_PRIMARY
, false);
9011 mdcache
->journal_dirty_inode(mdr
.get(), &le
->metablob
, diri
);
9013 // journal the snaprealm changes
9014 submit_mdlog_entry(le
, new C_MDS_renamesnap_finish(this, mdr
, diri
, snapid
),
9019 void Server::_renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
)
9021 dout(10) << "_renamesnap_finish " << *mdr
<< " " << snapid
<< dendl
;
9023 diri
->pop_and_dirty_projected_inode(mdr
->ls
);
9026 mds
->snapclient
->commit(mdr
->more()->stid
, mdr
->ls
);
9028 dout(10) << "snaprealm now " << *diri
->snaprealm
<< dendl
;
9030 mdcache
->do_realm_invalidate_and_update_notify(diri
, CEPH_SNAP_OP_UPDATE
, true);
9035 mdr
->snapid
= snapid
;
9036 respond_to_request(mdr
, 0);
9040 * Return true if server is in state RECONNECT and this
9041 * client has not yet reconnected.
9043 bool Server::waiting_for_reconnect(client_t c
) const
9045 return client_reconnect_gather
.count(c
) > 0;
9048 void Server::dump_reconnect_status(Formatter
*f
) const
9050 f
->open_object_section("reconnect_status");
9051 f
->dump_stream("client_reconnect_gather") << client_reconnect_gather
;