1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20
21 #include "MDSRank.h"
22 #include "Server.h"
23 #include "Locker.h"
24 #include "MDCache.h"
25 #include "MDLog.h"
26 #include "Migrator.h"
27 #include "MDBalancer.h"
28 #include "InoTable.h"
29 #include "SnapClient.h"
30 #include "Mutation.h"
31
32 #include "msg/Messenger.h"
33
34 #include "osdc/Objecter.h"
35
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
42
43 #include "messages/MMDSSlaveRequest.h"
44
45 #include "messages/MLock.h"
46
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
52
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
59
60 #include <errno.h>
61
62 #include <list>
63 #include <iostream>
64 using namespace std;
65
66 #include "common/config.h"
67
68 #define dout_context g_ceph_context
69 #define dout_subsys ceph_subsys_mds
70 #undef dout_prefix
71 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
72
73 class ServerContext : public MDSInternalContextBase {
74 protected:
75 Server *server;
76 MDSRank *get_mds() override
77 {
78 return server->mds;
79 }
80
81 public:
82 explicit ServerContext(Server *s) : server(s) {
83 assert(server != NULL);
84 }
85 };
86
87 class ServerLogContext : public MDSLogContextBase {
88 protected:
89 Server *server;
90 MDSRank *get_mds() override
91 {
92 return server->mds;
93 }
94
95 MDRequestRef mdr;
96 void pre_finish(int r) override {
97 if (mdr)
98 mdr->mark_event("journal_committed: ");
99 }
100 public:
101 explicit ServerLogContext(Server *s) : server(s) {
102 assert(server != NULL);
103 }
104 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
105 assert(server != NULL);
106 }
107 };
108
109 void Server::create_logger()
110 {
111 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
112 plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
113 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
114 plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
115 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
116 plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
117 "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
118 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
119 plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_slave_request", "Slave requests dispatched");
120 plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
121 "Request type lookup hash of inode");
122 plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
123 "Request type lookup inode");
124 plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
125 "Request type lookup parent");
126 plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
127 "Request type lookup name");
128 plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
129 "Request type lookup");
130 plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
131 "Request type lookup snapshot");
132 plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
133 "Request type get attribute");
134 plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
135 "Request type set attribute");
136 plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
137 "Request type set file layout");
138 plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
139 "Request type set directory layout");
140 plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
141 "Request type set extended attribute");
142 plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
143 "Request type remove extended attribute");
144 plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
145 "Request type read directory");
146 plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
147 "Request type set file lock");
148 plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
149 "Request type get file lock");
150 plb.add_u64_counter(l_mdss_req_create, "req_create",
151 "Request type create");
152 plb.add_u64_counter(l_mdss_req_open, "req_open",
153 "Request type open");
154 plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
155 "Request type make node");
156 plb.add_u64_counter(l_mdss_req_link, "req_link",
157 "Request type link");
158 plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
159 "Request type unlink");
160 plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
161 "Request type remove directory");
162 plb.add_u64_counter(l_mdss_req_rename, "req_rename",
163 "Request type rename");
164 plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
165 "Request type make directory");
166 plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
167 "Request type symbolic link");
168 plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
169 "Request type list snapshot");
170 plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
171 "Request type make snapshot");
172 plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
173 "Request type remove snapshot");
174 plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
175 "Request type rename snapshot");
176 logger = plb.create_perf_counters();
177 g_ceph_context->get_perfcounters_collection()->add(logger);
178 }
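// [Editor's note] A minimal usage sketch: once registered, these counters are
// incremented with logger->inc(...) (as respond_to_request() does below) and
// show up under the "mds_server" section of the admin socket's perf dump,
// e.g.:
//
//   ceph daemon mds.<id> perf dump mds_server
//
//   if (logger)
//     logger->inc(l_mdss_handle_client_request);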
179
180 Server::Server(MDSRank *m) :
181 mds(m),
182 mdcache(mds->mdcache), mdlog(mds->mdlog),
183 logger(0),
184 is_full(false),
185 reconnect_done(NULL),
186 failed_reconnects(0),
187 reconnect_evicting(false),
188 terminating_sessions(false)
189 {
190 }
191
192
193 /* This function DOES put the passed message before returning */
194 void Server::dispatch(Message *m)
195 {
196 switch (m->get_type()) {
197 case CEPH_MSG_CLIENT_RECONNECT:
198 handle_client_reconnect(static_cast<MClientReconnect*>(m));
199 return;
200 }
201
202 // active?
203 if (!mds->is_active()) {
204 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
205 (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
206 MClientRequest *req = static_cast<MClientRequest*>(m);
207 Session *session = get_session(req);
208 if (!session || session->is_closed()) {
209 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
210 req->put();
211 return;
212 }
213 bool queue_replay = false;
214 if (req->is_replay()) {
215 dout(3) << "queuing replayed op" << dendl;
216 queue_replay = true;
217 } else if (req->get_retry_attempt()) {
218 // process completed requests in the clientreplay stage. A completed request
219 // may have created a new file/directory. This guarantees the MDS sends a reply
220 // to the client before another request modifies the new file/directory.
221 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
222 dout(3) << "queuing completed op" << dendl;
223 queue_replay = true;
224 }
225 // this request was created before the cap reconnect message, drop any embedded
226 // cap releases.
227 req->releases.clear();
228 }
229 if (queue_replay) {
230 req->mark_queued_for_replay();
231 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
232 return;
233 }
234 }
235
236 bool wait_for_active = true;
237 if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
238 // handle_slave_request() will wait if necessary
239 wait_for_active = false;
240 } else if (mds->is_stopping()) {
241 if (m->get_source().is_mds() ||
242 m->get_type() == CEPH_MSG_CLIENT_SESSION)
243 wait_for_active = false;
244 } else if (mds->is_clientreplay()) {
245 // session open requests need to be handled during replay,
246 // close requests need to be delayed
247 if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
248 (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
249 wait_for_active = false;
250 } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
251 MClientRequest *req = static_cast<MClientRequest*>(m);
252 if (req->is_queued_for_replay()) {
253 wait_for_active = false;
254 }
255 }
256 }
257 if (wait_for_active) {
258 dout(3) << "not active yet, waiting" << dendl;
259 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
260 return;
261 }
262 }
263
264 switch (m->get_type()) {
265 case CEPH_MSG_CLIENT_SESSION:
266 handle_client_session(static_cast<MClientSession*>(m));
267 return;
268 case CEPH_MSG_CLIENT_REQUEST:
269 handle_client_request(static_cast<MClientRequest*>(m));
270 return;
271 case MSG_MDS_SLAVE_REQUEST:
272 handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
273 return;
274 default:
275 derr << "server unknown message " << m->get_type() << dendl;
276 assert(0 == "server unknown message");
277 }
278 }
279
280
281
282 // ----------------------------------------------------------
283 // SESSION management
284
285 class C_MDS_session_finish : public ServerLogContext {
286 Session *session;
287 uint64_t state_seq;
288 bool open;
289 version_t cmapv;
290 interval_set<inodeno_t> inos;
291 version_t inotablev;
292 Context *fin;
293 public:
294 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
295 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
296 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
297 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
298 void finish(int r) override {
299 assert(r == 0);
300 server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
301 if (fin) {
302 fin->complete(r);
303 }
304 }
305 };
306
307 Session *Server::get_session(Message *m)
308 {
309 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
310 if (session) {
311 dout(20) << "get_session have " << session << " " << session->info.inst
312 << " state " << session->get_state_name() << dendl;
313 session->put(); // we do not carry a ref
314 } else {
315 dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
316 }
317 return session;
318 }
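// [Editor's note] get_session() returns a borrowed pointer: the ref taken
// from the Connection's priv is dropped before returning, so callers must
// not stash the pointer across message boundaries. A typical caller sketch
// (mirroring the handlers in this file):
//
//   Session *session = get_session(m);
//   if (!session || session->is_closed()) {
//     m->put();      // no usable session; drop the message
//     return;
//   }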
319
320 /* This function DOES put the passed message before returning */
321 void Server::handle_client_session(MClientSession *m)
322 {
323 version_t pv;
324 bool blacklisted = false;
325 Session *session = get_session(m);
326
327 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
328 assert(m->get_source().is_client()); // should _not_ come from an mds!
329
330 if (!session) {
331 dout(0) << " ignoring sessionless msg " << *m << dendl;
332 m->put();
333 return;
334 }
335
336 if (logger)
337 logger->inc(l_mdss_handle_client_session);
338
339 uint64_t sseq = 0;
340 switch (m->get_op()) {
341 case CEPH_SESSION_REQUEST_OPEN:
342 if (session->is_opening() ||
343 session->is_open() ||
344 session->is_stale() ||
345 session->is_killing()) {
346 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
347 // set client metadata for session opened by prepare_force_open_sessions
348 if (!m->client_meta.empty())
349 session->set_client_metadata(m->client_meta);
350 m->put();
351 return;
352 }
353 assert(session->is_closed() ||
354 session->is_closing());
355
356 if (mds->is_stopping()) {
357 dout(10) << "mds is stopping, dropping open req" << dendl;
358 m->put();
359 return;
360 }
361
362 blacklisted = mds->objecter->with_osdmap(
363 [session](const OSDMap &osd_map) -> bool {
364 return osd_map.is_blacklisted(session->info.inst.addr);
365 });
366
367 if (blacklisted) {
368 dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
369 m->put();
370 return;
371 }
372
373 session->set_client_metadata(m->client_meta);
374 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
375 << session->info.client_metadata.size() << " metadata entries:" << dendl;
376 for (map<string, string>::iterator i = session->info.client_metadata.begin();
377 i != session->info.client_metadata.end(); ++i) {
378 dout(20) << " " << i->first << ": " << i->second << dendl;
379 }
380
381 // Special case for the 'root' metadata path; validate that the claimed
382 // root is actually within the caps of the session
383 if (session->info.client_metadata.count("root")) {
384 const auto claimed_root = session->info.client_metadata.at("root");
385 // claimed_root has a leading "/" which we strip before passing
386 // into caps check
387 if (claimed_root.empty() || claimed_root[0] != '/' ||
388 !session->auth_caps.path_capable(claimed_root.substr(1))) {
389 derr << __func__ << " forbidden path claimed as mount root: "
390 << claimed_root << " by " << m->get_source() << dendl;
391 // Tell the client we're rejecting their open
392 mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
393 mds->clog->warn() << "client session with invalid root '" <<
394 claimed_root << "' denied (" << session->info.inst << ")";
395 session->clear();
396 // Drop out; don't record this session in SessionMap or journal it.
397 break;
398 }
399 }
400
401 if (session->is_closed())
402 mds->sessionmap.add_session(session);
403
404 pv = mds->sessionmap.mark_projected(session);
405 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
406 mds->sessionmap.touch_session(session);
407 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
408 new C_MDS_session_finish(this, session, sseq, true, pv));
409 mdlog->flush();
410 break;
411
412 case CEPH_SESSION_REQUEST_RENEWCAPS:
413 if (session->is_open() ||
414 session->is_stale()) {
415 mds->sessionmap.touch_session(session);
416 if (session->is_stale()) {
417 mds->sessionmap.set_state(session, Session::STATE_OPEN);
418 mds->locker->resume_stale_caps(session);
419 mds->sessionmap.touch_session(session);
420 }
421 m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
422 } else {
423 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
424 }
425 break;
426
427 case CEPH_SESSION_REQUEST_CLOSE:
428 {
429 if (session->is_closed() ||
430 session->is_closing() ||
431 session->is_killing()) {
432 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
433 m->put();
434 return;
435 }
436 if (session->is_importing()) {
437 dout(10) << "ignoring close req on importing session" << dendl;
438 m->put();
439 return;
440 }
441 assert(session->is_open() ||
442 session->is_stale() ||
443 session->is_opening());
444 if (m->get_seq() < session->get_push_seq()) {
445 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
446 << ", dropping" << dendl;
447 m->put();
448 return;
449 }
450 // We are getting a seq that is higher than expected.
451 // Handle the same as any other seq error.
452 //
453 if (m->get_seq() != session->get_push_seq()) {
454 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
455 << ", BUGGY!" << dendl;
456 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
457 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
458 m->put();
459 return;
460 }
461 journal_close_session(session, Session::STATE_CLOSING, NULL);
462 }
463 break;
464
465 case CEPH_SESSION_FLUSHMSG_ACK:
466 finish_flush_session(session, m->get_seq());
467 break;
468
469 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
470 if (mds->is_active())
471 mdlog->flush();
472 break;
473
474 default:
475 ceph_abort();
476 }
477 m->put();
478 }
479
480 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
481 {
482 for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
483 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
484 assert(session);
485 if (!session->is_open() ||
486 !session->connection.get() ||
487 !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
488 continue;
489 version_t seq = session->wait_for_flush(gather.new_sub());
490 mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
491 }
492 }
493
494 void Server::finish_flush_session(Session *session, version_t seq)
495 {
496 list<MDSInternalContextBase*> finished;
497 session->finish_flush(seq, finished);
498 mds->queue_waiters(finished);
499 }
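// [Editor's note] flush_client_sessions()/finish_flush_session() implement a
// FLUSHMSG -> FLUSHMSG_ACK handshake (the ACK arrives via
// handle_client_session() above). A hypothetical caller sketch, assuming
// client_set has already been collected:
//
//   MDSGatherBuilder gather(g_ceph_context);
//   flush_client_sessions(client_set, gather);
//   if (gather.has_subs()) {
//     gather.set_finisher(new C_MDS_RetryMessage(mds, m));  // resume later
//     gather.activate();
//     return;
//   }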
500
501 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
502 interval_set<inodeno_t>& inos, version_t piv)
503 {
504 dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
505 << " " << pv << dendl;
506
507 if (piv) {
508 assert(session->is_closing() || session->is_killing() ||
509 session->is_opening()); // re-open closing session
510 session->info.prealloc_inos.subtract(inos);
511 mds->inotable->apply_release_ids(inos);
512 assert(mds->inotable->get_version() == piv);
513 }
514
515 mds->sessionmap.mark_dirty(session);
516
517 // apply
518 if (session->get_state_seq() != state_seq) {
519 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
520 << ", noop" << dendl;
521 // close must have been canceled (by an import?), or any number of other things..
522 } else if (open) {
523 assert(session->is_opening());
524 mds->sessionmap.set_state(session, Session::STATE_OPEN);
525 mds->sessionmap.touch_session(session);
526 assert(session->connection != NULL);
527 session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
528 if (mdcache->is_readonly())
529 session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
530 } else if (session->is_closing() ||
531 session->is_killing()) {
532 // kill any lingering capabilities, leases, requests
533 while (!session->caps.empty()) {
534 Capability *cap = session->caps.front();
535 CInode *in = cap->get_inode();
536 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
537 mds->locker->remove_client_cap(in, session->info.inst.name.num());
538 }
539 while (!session->leases.empty()) {
540 ClientLease *r = session->leases.front();
541 CDentry *dn = static_cast<CDentry*>(r->parent);
542 dout(20) << " killing client lease of " << *dn << dendl;
543 dn->remove_client_lease(r, mds->locker);
544 }
545 if (client_reconnect_gather.count(session->info.get_client())) {
546 dout(20) << " removing client from reconnect set" << dendl;
547 client_reconnect_gather.erase(session->info.get_client());
548
549 if (client_reconnect_gather.empty()) {
550 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
551 reconnect_gather_finish();
552 }
553 }
554
555 if (session->is_closing()) {
556 // mark con disposable. if there is a fault, we will get a
557 // reset and clean it up. if the client hasn't received the
558 // CLOSE message yet, they will reconnect and get an
559 // ms_handle_remote_reset() and realize they had in fact closed.
560 // do this *before* sending the message to avoid a possible
561 // race.
562 if (session->connection != NULL) {
563 // Conditional because terminate_sessions will indiscriminately
564 // put sessions in CLOSING whether they ever had a conn or not.
565 session->connection->mark_disposable();
566 }
567
568 // reset session
569 mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
570 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
571 session->clear();
572 mds->sessionmap.remove_session(session);
573 } else if (session->is_killing()) {
574 // destroy session, close connection
575 if (session->connection != NULL) {
576 session->connection->mark_down();
577 }
578 mds->sessionmap.remove_session(session);
579 } else {
580 ceph_abort();
581 }
582 } else {
583 ceph_abort();
584 }
585 }
586
587 /**
588 * Inject sessions from some source other than actual connections.
589 *
590 * For example:
591 * - sessions inferred from journal replay
592 * - sessions learned from other MDSs during rejoin
593 * - sessions learned from other MDSs during dir/caps migration
594 * - sessions learned from other MDSs during a cross-MDS rename
595 */
596 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
597 map<client_t,uint64_t>& sseqmap)
598 {
599 version_t pv = mds->sessionmap.get_projected();
600
601 dout(10) << "prepare_force_open_sessions " << pv
602 << " on " << cm.size() << " clients"
603 << dendl;
604 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
605
606 Session *session = mds->sessionmap.get_or_add_session(p->second);
607 pv = mds->sessionmap.mark_projected(session);
608 if (session->is_closed() ||
609 session->is_closing() ||
610 session->is_killing())
611 sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
612 else
613 assert(session->is_open() ||
614 session->is_opening() ||
615 session->is_stale());
616 session->inc_importing();
617 }
618 return pv;
619 }
620
621 void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
622 map<client_t,uint64_t>& sseqmap,
623 bool dec_import)
624 {
625 /*
626 * FIXME: need to carefully consider the race conditions between a
627 * client trying to close a session and an MDS doing an import
628 * trying to force open a session...
629 */
630 dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
631 << " initial v " << mds->sessionmap.get_version() << dendl;
632
633
634 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
635
636 Session *session = mds->sessionmap.get_session(p->second.name);
637 assert(session);
638
639 if (sseqmap.count(p->first)) {
640 uint64_t sseq = sseqmap[p->first];
641 if (session->get_state_seq() != sseq) {
642 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
643 } else {
644 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
645 mds->sessionmap.set_state(session, Session::STATE_OPEN);
646 mds->sessionmap.touch_session(session);
647 mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
648 if (mdcache->is_readonly())
649 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
650 }
651 } else {
652 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
653 assert(session->is_open() || session->is_stale());
654 }
655
656 if (dec_import) {
657 session->dec_importing();
658 }
659
660 mds->sessionmap.mark_dirty(session);
661 }
662
663 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
664 }
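// [Editor's note] A sketch of how the prepare/finish pair above is driven by
// an importing MDS (hypothetical caller; the real ones live in Migrator and
// the slave-rename path):
//
//   map<client_t,uint64_t> sseqmap;
//   version_t pv = prepare_force_open_sessions(client_map, sseqmap);
//   // ... journal an event carrying client_map and pv, then on commit:
//   finish_force_open_sessions(client_map, sseqmap, true /* dec_import */);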
665
666 class C_MDS_TerminatedSessions : public ServerContext {
667 void finish(int r) override {
668 server->terminating_sessions = false;
669 }
670 public:
671 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
672 };
673
674 void Server::terminate_sessions()
675 {
676 dout(2) << "terminate_sessions" << dendl;
677
678 terminating_sessions = true;
679
680 // kill them off. clients will retry etc.
681 set<Session*> sessions;
682 mds->sessionmap.get_client_session_set(sessions);
683 for (set<Session*>::const_iterator p = sessions.begin();
684 p != sessions.end();
685 ++p) {
686 Session *session = *p;
687 if (session->is_closing() ||
688 session->is_killing() ||
689 session->is_closed())
690 continue;
691 journal_close_session(session, Session::STATE_CLOSING, NULL);
692 }
693
694 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
695 }
696
697
698 void Server::find_idle_sessions()
699 {
700 dout(10) << "find_idle_sessions. laggy until " << mds->get_laggy_until() << dendl;
701
702 // timeout/stale
703 // (caps go stale, leases die)
704 utime_t now = ceph_clock_now();
705 utime_t cutoff = now;
706 cutoff -= g_conf->mds_session_timeout;
707 while (1) {
708 Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
709 if (!session) break;
710 dout(20) << "laggiest active session is " << session->info.inst << dendl;
711 if (session->last_cap_renew >= cutoff) {
712 dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
713 << session->last_cap_renew << ")" << dendl;
714 break;
715 }
716
717 dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
718 mds->sessionmap.set_state(session, Session::STATE_STALE);
719 mds->locker->revoke_stale_caps(session);
720 mds->locker->remove_stale_leases(session);
721 mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
722 finish_flush_session(session, session->get_push_seq());
723 }
724
725 // autoclose
726 cutoff = now;
727 cutoff -= g_conf->mds_session_autoclose;
728
729 // don't kick clients if we've been laggy
730 if (mds->get_laggy_until() > cutoff) {
731 dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
732 << ", not kicking any clients to be safe" << dendl;
733 return;
734 }
735
736 if (mds->sessionmap.get_sessions().size() == 1 &&
737 mds->mdsmap->get_num_in_mds() == 1) {
738 dout(20) << "not evicting a slow client, because there is only one"
739 << dendl;
740 return;
741 }
742
743 // Collect a list of sessions exceeding the autoclose threshold
744 std::vector<Session *> to_evict;
745 const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
746 if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
747 return;
748 }
749 const auto &stale_sessions = sessions_p->second;
750 assert(stale_sessions != nullptr);
751
752 for (const auto &session: *stale_sessions) {
753 if (session->is_importing()) {
754 dout(10) << "stopping at importing session " << session->info.inst << dendl;
755 break;
756 }
757 assert(session->is_stale());
758 if (session->last_cap_renew >= cutoff) {
759 dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
760 << session->last_cap_renew << ")" << dendl;
761 break;
762 }
763
764 to_evict.push_back(session);
765 }
766
767 for (const auto &session: to_evict) {
768 utime_t age = now;
769 age -= session->last_cap_renew;
770 mds->clog->warn() << "evicting unresponsive client " << *session
771 << ", after " << age << " seconds";
772 dout(10) << "autoclosing stale session " << session->info.inst << " last "
773 << session->last_cap_renew << dendl;
774
775 if (g_conf->mds_session_blacklist_on_timeout) {
776 std::stringstream ss;
777 mds->evict_client(session->info.inst.name.num(), false, true,
778 ss, nullptr);
779 } else {
780 kill_session(session, NULL);
781 }
782 }
783 }
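// [Editor's note] A worked example of the two cutoffs above, using the
// defaults of this era (mds_session_timeout = 60, mds_session_autoclose =
// 300): a session whose last_cap_renew is older than now-60s is marked
// STALE and has its caps revoked and leases removed; once it ages past
// now-300s it becomes an eviction candidate (blacklisted if
// mds_session_blacklist_on_timeout is set, otherwise a plain kill_session()).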
784
785 /*
786 * XXX a bump in the interface here: not using an MDSInternalContextBase
787 * because all the callers right now happen to use a SaferCond
788 */
789 void Server::kill_session(Session *session, Context *on_safe)
790 {
791 assert(mds->mds_lock.is_locked_by_me());
792
793 if ((session->is_opening() ||
794 session->is_open() ||
795 session->is_stale()) &&
796 !session->is_importing()) {
797 dout(10) << "kill_session " << session << dendl;
798 journal_close_session(session, Session::STATE_KILLING, on_safe);
799 } else {
800 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
801 assert(session->is_closing() ||
802 session->is_closed() ||
803 session->is_killing() ||
804 session->is_importing());
805 if (on_safe) {
806 on_safe->complete(0);
807 }
808 }
809 }
810
811 size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
812 {
813 std::list<Session*> victims;
814 const auto sessions = mds->sessionmap.get_sessions();
815 for (const auto p : sessions) {
816 if (!p.first.is_client()) {
817 // Do not apply OSDMap blacklist to MDS daemons, we find out
818 // about their death via MDSMap.
819 continue;
820 }
821
822 Session *s = p.second;
823 if (blacklist.count(s->info.inst.addr)) {
824 victims.push_back(s);
825 }
826 }
827
828 for (const auto s : victims) {
829 kill_session(s, nullptr);
830 }
831
832 dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
833
834 return victims.size();
835 }
836
837 void Server::journal_close_session(Session *session, int state, Context *on_safe)
838 {
839 uint64_t sseq = mds->sessionmap.set_state(session, state);
840 version_t pv = mds->sessionmap.mark_projected(session);
841 version_t piv = 0;
842
843 // release alloc and pending-alloc inos for this session
844 // and wipe out session state, in case the session close aborts for some reason
845 interval_set<inodeno_t> both;
846 both.insert(session->info.prealloc_inos);
847 both.insert(session->pending_prealloc_inos);
848 if (both.size()) {
849 mds->inotable->project_release_ids(both);
850 piv = mds->inotable->get_projected_version();
851 } else
852 piv = 0;
853
854 mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
855 new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
856 mdlog->flush();
857
858 // clean up requests, too
859 elist<MDRequestImpl*>::iterator p =
860 session->requests.begin(member_offset(MDRequestImpl,
861 item_session_request));
862 while (!p.end()) {
863 MDRequestRef mdr = mdcache->request_get((*p)->reqid);
864 ++p;
865 mdcache->request_kill(mdr);
866 }
867
868 finish_flush_session(session, session->get_push_seq());
869 }
870
871 void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
872 {
873 reconnect_done = reconnect_done_;
874 mds->sessionmap.get_client_set(client_reconnect_gather);
875
876 if (client_reconnect_gather.empty()) {
877 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
878 reconnect_gather_finish();
879 return;
880 }
881
882 // clients will get the mdsmap and discover we're reconnecting via the monitor.
883
884 reconnect_start = ceph_clock_now();
885 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
886 mds->sessionmap.dump();
887 }
888
889 /* This function DOES put the passed message before returning */
890 void Server::handle_client_reconnect(MClientReconnect *m)
891 {
892 dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
893 client_t from = m->get_source().num();
894 Session *session = get_session(m);
895 assert(session);
896
897 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
898 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
899 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
900 return;
901 }
902
903 utime_t delay = ceph_clock_now();
904 delay -= reconnect_start;
905 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
906
907 bool deny = false;
908 if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
909 // XXX maybe in the future we can do better than this?
910 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
911 mds->clog->info() << "denied reconnect attempt (mds is "
912 << ceph_mds_state_name(mds->get_state())
913 << ") from " << m->get_source_inst()
914 << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
915 deny = true;
916 } else if (session->is_closed()) {
917 dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
918 mds->clog->info() << "denied reconnect attempt (mds is "
919 << ceph_mds_state_name(mds->get_state())
920 << ") from " << m->get_source_inst() << " (session is closed)";
921 deny = true;
922 } else if (mdcache->is_readonly()) {
923 dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
924 mds->clog->info() << "denied reconnect attempt (mds is read-only)";
925 deny = true;
926 }
927
928 if (deny) {
929 m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
930 m->put();
931 return;
932 }
933
934 // notify client of success with an OPEN
935 m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
936 session->last_cap_renew = ceph_clock_now();
937 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
938
939 // snaprealms
940 for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
941 p != m->realms.end();
942 ++p) {
943 CInode *in = mdcache->get_inode(inodeno_t(p->ino));
944 if (in && in->state_test(CInode::STATE_PURGING))
945 continue;
946 if (in) {
947 assert(in->snaprealm);
948 if (in->snaprealm->have_past_parents_open()) {
949 dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
950 mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
951 } else {
952 dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
953 mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
954 }
955 } else {
956 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
957 << " seq " << p->seq << dendl;
958 mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
959 }
960 }
961
962 // caps
963 for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
964 p != m->caps.end();
965 ++p) {
966 // make sure our last_cap_id is MAX over all issued caps
967 if (p->second.capinfo.cap_id > mdcache->last_cap_id)
968 mdcache->last_cap_id = p->second.capinfo.cap_id;
969
970 CInode *in = mdcache->get_inode(p->first);
971 if (in && in->state_test(CInode::STATE_PURGING))
972 continue;
973 if (in && in->is_auth()) {
974 // we recovered it, and it's ours. take note.
975 dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
976 << " on " << *in << dendl;
977 in->reconnect_cap(from, p->second, session);
978 mdcache->add_reconnected_cap(from, p->first, p->second);
979 recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
980 continue;
981 }
982
983 if (in && !in->is_auth()) {
984 // not mine.
985 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
986 // add to cap export list.
987 p->second.path.clear(); // we don't need path
988 mdcache->rejoin_export_caps(p->first, from, p->second,
989 in->authority().first);
990 } else {
991 // don't know if the inode is mine
992 dout(10) << "missing ino " << p->first << ", will load later" << dendl;
993 p->second.path.clear(); // we don't need path
994 mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
995 }
996 }
997
998 // remove from gather set
999 client_reconnect_gather.erase(from);
1000 if (client_reconnect_gather.empty())
1001 reconnect_gather_finish();
1002
1003 m->put();
1004 }
1005
1006
1007
1008 void Server::reconnect_gather_finish()
1009 {
1010 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1011 assert(reconnect_done);
1012 reconnect_done->complete(0);
1013 reconnect_done = NULL;
1014 }
1015
1016 void Server::reconnect_tick()
1017 {
1018 if (reconnect_evicting) {
1019 dout(4) << "reconnect_tick: waiting for evictions" << dendl;
1020 return;
1021 }
1022
1023 utime_t reconnect_end = reconnect_start;
1024 reconnect_end += g_conf->mds_reconnect_timeout;
1025 if (ceph_clock_now() >= reconnect_end &&
1026 !client_reconnect_gather.empty()) {
1027 dout(10) << "reconnect timed out" << dendl;
1028
1029 // If we're doing blacklist evictions, use this to wait for them before
1030 // proceeding to reconnect_gather_finish
1031 MDSGatherBuilder gather(g_ceph_context);
1032
1033 for (set<client_t>::iterator p = client_reconnect_gather.begin();
1034 p != client_reconnect_gather.end();
1035 ++p) {
1036 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
1037 assert(session);
1038 dout(1) << "reconnect gave up on " << session->info.inst << dendl;
1039
1040 mds->clog->warn() << "evicting unresponsive client " << *session
1041 << ", after waiting " << g_conf->mds_reconnect_timeout
1042 << " seconds during MDS startup";
1043
1044 if (g_conf->mds_session_blacklist_on_timeout) {
1045 std::stringstream ss;
1046 mds->evict_client(session->info.inst.name.num(), false, true, ss,
1047 gather.new_sub());
1048 } else {
1049 kill_session(session, NULL);
1050 }
1051
1052 failed_reconnects++;
1053 }
1054 client_reconnect_gather.clear();
1055
1056 if (gather.has_subs()) {
1057 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1058 gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
1059 [this](int r){reconnect_gather_finish();})));
1060 gather.activate();
1061 reconnect_evicting = true;
1062 } else {
1063 reconnect_gather_finish();
1064 }
1065 }
1066 }
1067
1068 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1069 {
1070 if (!locks.length()) return;
1071 int numlocks;
1072 ceph_filelock lock;
1073 bufferlist::iterator p = locks.begin();
1074 ::decode(numlocks, p);
1075 for (int i = 0; i < numlocks; ++i) {
1076 ::decode(lock, p);
1077 lock.client = client;
1078 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1079 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1080 }
1081 ::decode(numlocks, p);
1082 for (int i = 0; i < numlocks; ++i) {
1083 ::decode(lock, p);
1084 lock.client = client;
1085 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1086 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1087 }
1088 }
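// [Editor's note] The bufferlist decoded above carries two back-to-back
// sections, fcntl locks then flock locks, each prefixed by a count. A
// hypothetical encoder producing a compatible payload (the real one lives
// on the client side):
//
//   bufferlist locks;
//   int num = fcntl_locks.size();
//   ::encode(num, locks);
//   for (auto& l : fcntl_locks)
//     ::encode(l, locks);          // ceph_filelock entries
//   num = flock_locks.size();
//   ::encode(num, locks);
//   for (auto& l : flock_locks)
//     ::encode(l, locks);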
1089
1090
1091 /**
1092 * Call this when the MDCache is oversized, to send requests to the clients
1093 * to trim some caps, and consequently unpin some inodes in the MDCache so
1094 * that it can trim too.
1095 */
1096 void Server::recall_client_state(void)
1097 {
1098 /* a single client may keep at most ~80% of all caps (mds_max_ratio_caps_per_client) */
1099 uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
1100 uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
1101 if (max_caps_per_client < min_caps_per_client) {
1102 dout(0) << "max_caps_per_client " << max_caps_per_client
1103 << " < min_caps_per_client " << min_caps_per_client << dendl;
1104 max_caps_per_client = min_caps_per_client + 1;
1105 }
1106
1107 /* ... unless the recall ratio yields a smaller limit: */
1108 /* ratio: the fraction of its caps each client may keep. Use the
1109 * percentage full over the cache reservation, capped so a client is never
1110 * asked to drop more than 80% of its caps. */
1111 double ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
1112
1113 dout(10) << "recall_client_state " << ratio
1114 << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
1115 << dendl;
1116
1117 set<Session*> sessions;
1118 mds->sessionmap.get_client_session_set(sessions);
1119 for (auto &session : sessions) {
1120 if (!session->is_open() ||
1121 !session->info.inst.name.is_client())
1122 continue;
1123
1124 dout(10) << " session " << session->info.inst
1125 << " caps " << session->caps.size()
1126 << ", leases " << session->leases.size()
1127 << dendl;
1128
1129 uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
1130 if (session->caps.size() > newlim) {
1131 MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
1132 m->head.max_caps = newlim;
1133 mds->send_message_client(m, session);
1134 session->notify_recall_sent(newlim);
1135 }
1136 }
1137 }
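// [Editor's note] A worked example of the newlim computation above: with the
// defaults (mds_min_caps_per_client = 100, mds_max_ratio_caps_per_client =
// 0.8) and a cache 25% over its reservation, ratio = 1.0 - 0.25 = 0.75, so a
// session holding 10000 caps gets
//
//   newlim = MAX(MIN(10000 * 0.75, max_caps_per_client), 100) = 7500
//
// (assuming max_caps_per_client >= 7500) and is asked to trim to 7500 caps
// via CEPH_SESSION_RECALL_STATE.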
1138
1139 void Server::force_clients_readonly()
1140 {
1141 dout(10) << "force_clients_readonly" << dendl;
1142 set<Session*> sessions;
1143 mds->sessionmap.get_client_session_set(sessions);
1144 for (set<Session*>::const_iterator p = sessions.begin();
1145 p != sessions.end();
1146 ++p) {
1147 Session *session = *p;
1148 if (!session->info.inst.name.is_client() ||
1149 !(session->is_open() || session->is_stale()))
1150 continue;
1151 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
1152 }
1153 }
1154
1155 /*******
1156 * some generic stuff for finishing off requests
1157 */
1158 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1159 {
1160 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1161 assert(!mdr->has_completed);
1162
1163 // note trace items for eventual reply.
1164 mdr->tracei = in;
1165 if (in)
1166 mdr->pin(in);
1167
1168 mdr->tracedn = dn;
1169 if (dn)
1170 mdr->pin(dn);
1171
1172 early_reply(mdr, in, dn);
1173
1174 mdr->committing = true;
1175 submit_mdlog_entry(le, fin, mdr, __func__);
1176
1177 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1178 if (mds->queue_one_replay()) {
1179 dout(10) << " queued next replay op" << dendl;
1180 } else {
1181 dout(10) << " journaled last replay op, flushing" << dendl;
1182 mdlog->flush();
1183 }
1184 } else if (mdr->did_early_reply)
1185 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1186 else
1187 mdlog->flush();
1188 }
1189
1190 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1191 const char *event)
1192 {
1193 if (mdr) {
1194 string event_str("submit entry: ");
1195 event_str += event;
1196 mdr->mark_event_string(event_str);
1197 }
1198 mdlog->submit_entry(le, fin);
1199 }
1200
1201 /*
1202 * send response built from mdr contents and error code; clean up mdr
1203 */
1204 void Server::respond_to_request(MDRequestRef& mdr, int r)
1205 {
1206 if (mdr->client_request) {
1207 reply_client_request(mdr, new MClientReply(mdr->client_request, r));
1208
1209 // add here to avoid counting ops multiple times (e.g., locks, loading)
1210 switch(mdr->client_request->get_op()) {
1211 case CEPH_MDS_OP_LOOKUPHASH:
1212 logger->inc(l_mdss_req_lookuphash);
1213 break;
1214 case CEPH_MDS_OP_LOOKUPINO:
1215 logger->inc(l_mdss_req_lookupino);
1216 break;
1217 case CEPH_MDS_OP_LOOKUPPARENT:
1218 logger->inc(l_mdss_req_lookupparent);
1219 break;
1220 case CEPH_MDS_OP_LOOKUPNAME:
1221 logger->inc(l_mdss_req_lookupname);
1222 break;
1223 case CEPH_MDS_OP_LOOKUP:
1224 logger->inc(l_mdss_req_lookup);
1225 break;
1226 case CEPH_MDS_OP_LOOKUPSNAP:
1227 logger->inc(l_mdss_req_lookupsnap);
1228 break;
1229 case CEPH_MDS_OP_GETATTR:
1230 logger->inc(l_mdss_req_getattr);
1231 break;
1232 case CEPH_MDS_OP_SETATTR:
1233 logger->inc(l_mdss_req_setattr);
1234 break;
1235 case CEPH_MDS_OP_SETLAYOUT:
1236 logger->inc(l_mdss_req_setlayout);
1237 break;
1238 case CEPH_MDS_OP_SETDIRLAYOUT:
1239 logger->inc(l_mdss_req_setdirlayout);
1240 break;
1241 case CEPH_MDS_OP_SETXATTR:
1242 logger->inc(l_mdss_req_setxattr);
1243 break;
1244 case CEPH_MDS_OP_RMXATTR:
1245 logger->inc(l_mdss_req_rmxattr);
1246 break;
1247 case CEPH_MDS_OP_READDIR:
1248 logger->inc(l_mdss_req_readdir);
1249 break;
1250 case CEPH_MDS_OP_SETFILELOCK:
1251 logger->inc(l_mdss_req_setfilelock);
1252 break;
1253 case CEPH_MDS_OP_GETFILELOCK:
1254 logger->inc(l_mdss_req_getfilelock);
1255 break;
1256 case CEPH_MDS_OP_CREATE:
1257 logger->inc(l_mdss_req_create);
break;
1258 case CEPH_MDS_OP_OPEN:
1259 logger->inc(l_mdss_req_open);
1260 break;
1261 case CEPH_MDS_OP_MKNOD:
1262 logger->inc(l_mdss_req_mknod);
1263 break;
1264 case CEPH_MDS_OP_LINK:
1265 logger->inc(l_mdss_req_link);
1266 break;
1267 case CEPH_MDS_OP_UNLINK:
1268 logger->inc(l_mdss_req_unlink);
1269 break;
1270 case CEPH_MDS_OP_RMDIR:
1271 logger->inc(l_mdss_req_rmdir);
1272 break;
1273 case CEPH_MDS_OP_RENAME:
1274 logger->inc(l_mdss_req_rename);
1275 break;
1276 case CEPH_MDS_OP_MKDIR:
1277 logger->inc(l_mdss_req_mkdir);
1278 break;
1279 case CEPH_MDS_OP_SYMLINK:
1280 logger->inc(l_mdss_req_symlink);
1281 break;
1282 case CEPH_MDS_OP_LSSNAP:
1283 logger->inc(l_mdss_req_lssnap);
1284 break;
1285 case CEPH_MDS_OP_MKSNAP:
1286 logger->inc(l_mdss_req_mksnap);
1287 break;
1288 case CEPH_MDS_OP_RMSNAP:
1289 logger->inc(l_mdss_req_rmsnap);
1290 break;
1291 case CEPH_MDS_OP_RENAMESNAP:
1292 logger->inc(l_mdss_req_renamesnap);
1293 break;
1294 }
1295 } else if (mdr->internal_op > -1) {
1296 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1297 if (!mdr->internal_op_finish)
1298 assert(0 == "trying to respond to internal op without finisher");
1299 mdr->internal_op_finish->complete(r);
1300 mdcache->request_finish(mdr);
1301 }
1302 }
1303
1304 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1305 {
1306 if (!g_conf->mds_early_reply)
1307 return;
1308
1309 if (mdr->no_early_reply) {
1310 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1311 return;
1312 }
1313
1314 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
1315 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
1316 return;
1317 }
1318
1319 if (mdr->alloc_ino) {
1320 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
1321 return;
1322 }
1323
1324 MClientRequest *req = mdr->client_request;
1325 entity_inst_t client_inst = req->get_source_inst();
1326 if (client_inst.name.is_mds())
1327 return;
1328
1329 if (req->is_replay()) {
1330 dout(10) << " no early reply on replay op" << dendl;
1331 return;
1332 }
1333
1334
1335 MClientReply *reply = new MClientReply(req, 0);
1336 reply->set_unsafe();
1337
1338 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1339 //
1340 //_rename_finish() does not send dentry link/unlink message to replicas.
1341 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1342 // that have projected linkages from getting new replica.
1343 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
1344
1345 dout(10) << "early_reply " << reply->get_result()
1346 << " (" << cpp_strerror(reply->get_result())
1347 << ") " << *req << dendl;
1348
1349 if (tracei || tracedn) {
1350 if (tracei)
1351 mdr->cap_releases.erase(tracei->vino());
1352 if (tracedn)
1353 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1354
1355 set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
1356 req->get_dentry_wanted(), mdr);
1357 }
1358
1359 reply->set_extra_bl(mdr->reply_extra_bl);
1360 req->get_connection()->send_message(reply);
1361
1362 mdr->did_early_reply = true;
1363
1364 mds->logger->inc(l_mds_reply);
1365 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
1366 mds->logger->tinc(l_mds_reply_latency, lat);
1367 dout(20) << "lat " << lat << dendl;
1368
1369 mdr->mark_event("early_replied");
1370 }
1371
1372 /*
1373 * send given reply
1374 * include a trace to tracei
1375 * Clean up mdr
1376 */
1377 void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
1378 {
1379 assert(mdr.get());
1380 MClientRequest *req = mdr->client_request;
1381
1382 dout(7) << "reply_client_request " << reply->get_result()
1383 << " (" << cpp_strerror(reply->get_result())
1384 << ") " << *req << dendl;
1385
1386 mdr->mark_event("replying");
1387
1388 Session *session = mdr->session;
1389
1390 // note successful request in session map?
1391 //
1392 // setfilelock requests are special: they only modify state in MDS memory,
1393 // and that state is lost when the MDS fails. If a client re-sends a completed
1394 // setfilelock request, it means the client did not receive the corresponding
1395 // reply, so the MDS should re-execute the request.
1396 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1397 reply->get_result() == 0 && session) {
1398 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1399 session->add_completed_request(mdr->reqid.tid, created);
1400 if (mdr->ls) {
1401 mdr->ls->touched_sessions.insert(session->info.inst.name);
1402 }
1403 }
1404
1405 // give any preallocated inos to the session
1406 apply_allocated_inos(mdr, session);
1407
1408 // get tracei/tracedn from mdr?
1409 snapid_t snapid = mdr->snapid;
1410 CInode *tracei = mdr->tracei;
1411 CDentry *tracedn = mdr->tracedn;
1412
1413 bool is_replay = mdr->client_request->is_replay();
1414 bool did_early_reply = mdr->did_early_reply;
1415 entity_inst_t client_inst = req->get_source_inst();
1416 int dentry_wanted = req->get_dentry_wanted();
1417
1418 if (!did_early_reply && !is_replay) {
1419
1420 mds->logger->inc(l_mds_reply);
1421 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1422 mds->logger->tinc(l_mds_reply_latency, lat);
1423 dout(20) << "lat " << lat << dendl;
1424
1425 if (tracei)
1426 mdr->cap_releases.erase(tracei->vino());
1427 if (tracedn)
1428 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1429 }
1430
1431 // drop non-rdlocks before replying, so that we can issue leases
1432 mdcache->request_drop_non_rdlocks(mdr);
1433
1434 // reply at all?
1435 if (client_inst.name.is_mds() || !session) {
1436 reply->put(); // mds doesn't need a reply
1437 reply = 0;
1438 } else {
1439 // send reply.
1440 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1441 (tracei || tracedn)) {
1442 if (is_replay) {
1443 if (tracei)
1444 mdcache->try_reconnect_cap(tracei, session);
1445 } else {
1446 // include metadata in reply
1447 set_trace_dist(session, reply, tracei, tracedn,
1448 snapid, dentry_wanted,
1449 mdr);
1450 }
1451 }
1452
1453 // We can set the extra bl unconditionally: if it's already been sent in the
1454 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1455 reply->set_extra_bl(mdr->reply_extra_bl);
1456
1457 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
1458 req->get_connection()->send_message(reply);
1459 }
1460
1461 if (req->is_queued_for_replay() &&
1462 (mdr->has_completed || reply->get_result() < 0)) {
1463 if (reply->get_result() < 0) {
1464 int r = reply->get_result();
1465 derr << "reply_client_request: failed to replay " << *req
1466 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
1467 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
1468 }
1469 mds->queue_one_replay();
1470 }
1471
1472 // clean up request
1473 mdcache->request_finish(mdr);
1474
1475 // take a closer look at tracei, if it happens to be a remote link
1476 if (tracei &&
1477 tracedn &&
1478 tracedn->get_projected_linkage()->is_remote()) {
1479 mdcache->eval_remote(tracedn);
1480 }
1481 }
1482
1483
1484 void Server::encode_empty_dirstat(bufferlist& bl)
1485 {
1486 static DirStat empty;
1487 empty.encode(bl);
1488 }
1489
1490 void Server::encode_infinite_lease(bufferlist& bl)
1491 {
1492 LeaseStat e;
1493 e.seq = 0;
1494 e.mask = -1;
1495 e.duration_ms = -1;
1496 ::encode(e, bl);
1497 dout(20) << "encode_infinite_lease " << e << dendl;
1498 }
1499
1500 void Server::encode_null_lease(bufferlist& bl)
1501 {
1502 LeaseStat e;
1503 e.seq = 0;
1504 e.mask = 0;
1505 e.duration_ms = 0;
1506 ::encode(e, bl);
1507 dout(20) << "encode_null_lease " << e << dendl;
1508 }
1509
1510
1511 /*
1512 * pass inode OR dentry (not both, or we may get confused)
1513 *
1514 * trace is in reverse order (i.e. root inode comes last)
1515 */
1516 void Server::set_trace_dist(Session *session, MClientReply *reply,
1517 CInode *in, CDentry *dn,
1518 snapid_t snapid,
1519 int dentry_wanted,
1520 MDRequestRef& mdr)
1521 {
1522 // skip doing this for debugging purposes?
1523 if (g_conf->mds_inject_traceless_reply_probability &&
1524 mdr->ls && !mdr->o_trunc &&
1525 (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
1526 dout(5) << "deliberately skipping trace for " << *reply << dendl;
1527 return;
1528 }
1529
1530 // inode, dentry, dir, ..., inode
1531 bufferlist bl;
1532 mds_rank_t whoami = mds->get_nodeid();
1533 client_t client = session->get_client();
1534 utime_t now = ceph_clock_now();
1535
1536 dout(20) << "set_trace_dist snapid " << snapid << dendl;
1537
1538 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
1539
1540 // realm
1541 if (snapid == CEPH_NOSNAP) {
1542 SnapRealm *realm;
1543 if (in)
1544 realm = in->find_snaprealm();
1545 else
1546 realm = dn->get_dir()->get_inode()->find_snaprealm();
1547 reply->snapbl = realm->get_snap_trace();
1548 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
1549 }
1550
1551 // dir + dentry?
1552 if (dn) {
1553 reply->head.is_dentry = 1;
1554 CDir *dir = dn->get_dir();
1555 CInode *diri = dir->get_inode();
1556
1557 diri->encode_inodestat(bl, session, NULL, snapid);
1558 dout(20) << "set_trace_dist added diri " << *diri << dendl;
1559
1560 #ifdef MDS_VERIFY_FRAGSTAT
1561 if (dir->is_complete())
1562 dir->verify_fragstat();
1563 #endif
1564 dir->encode_dirstat(bl, whoami);
1565 dout(20) << "set_trace_dist added dir " << *dir << dendl;
1566
1567 ::encode(dn->get_name(), bl);
1568 if (snapid == CEPH_NOSNAP)
1569 mds->locker->issue_client_lease(dn, client, bl, now, session);
1570 else
1571 encode_null_lease(bl);
1572 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
1573 } else
1574 reply->head.is_dentry = 0;
1575
1576 // inode
1577 if (in) {
1578 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
1579 dout(20) << "set_trace_dist added in " << *in << dendl;
1580 reply->head.is_target = 1;
1581 } else
1582 reply->head.is_target = 0;
1583
1584 reply->set_trace(bl);
1585 }
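// [Editor's note] The trace bufferlist assembled above is consumed by the
// client in exactly the order written here; for a reply with both pieces:
//
//   [diri inodestat][dirstat][dentry name][lease][target inodestat]
//
// with reply->head.is_dentry / is_target telling the decoder which parts
// are present.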
1586
1587
1588
1589
1590 /***
1591 * process a client request
1592 * This function DOES put the passed message before returning
1593 */
1594 void Server::handle_client_request(MClientRequest *req)
1595 {
1596 dout(4) << "handle_client_request " << *req << dendl;
1597
1598 if (mds->logger)
1599 mds->logger->inc(l_mds_request);
1600 if (logger)
1601 logger->inc(l_mdss_handle_client_request);
1602
1603 if (!mdcache->is_open()) {
1604 dout(5) << "waiting for root" << dendl;
1605 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
1606 return;
1607 }
1608
1609 // active session?
1610 Session *session = 0;
1611 if (req->get_source().is_client()) {
1612 session = get_session(req);
1613 if (!session) {
1614 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
1615 } else if (session->is_closed() ||
1616 session->is_closing() ||
1617 session->is_killing()) {
1618 dout(5) << "session closed|closing|killing, dropping" << dendl;
1619 session = NULL;
1620 }
1621 if (!session) {
1622 if (req->is_queued_for_replay())
1623 mds->queue_one_replay();
1624 req->put();
1625 return;
1626 }
1627 }
1628
1629 // old mdsmap?
1630 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
1631 // send it? hrm, this isn't ideal; they may get a lot of copies if
1632 // they have a high request rate.
1633 }
1634
1635 // completed request?
1636 bool has_completed = false;
1637 if (req->is_replay() || req->get_retry_attempt()) {
1638 assert(session);
1639 inodeno_t created;
1640 if (session->have_completed_request(req->get_reqid().tid, &created)) {
1641 has_completed = true;
1642 // Don't send a traceless reply if the completed request has created a
1643 // new inode. Treat the request as a lookup request instead.
1644 if (req->is_replay() ||
1645 ((created == inodeno_t() || !mds->is_clientreplay()) &&
1646 req->get_op() != CEPH_MDS_OP_OPEN &&
1647 req->get_op() != CEPH_MDS_OP_CREATE)) {
1648 dout(5) << "already completed " << req->get_reqid() << dendl;
1649 MClientReply *reply = new MClientReply(req, 0);
1650 if (created != inodeno_t()) {
1651 bufferlist extra;
1652 ::encode(created, extra);
1653 reply->set_extra_bl(extra);
1654 }
1655 req->get_connection()->send_message(reply);
1656
1657 if (req->is_queued_for_replay())
1658 mds->queue_one_replay();
1659
1660 req->put();
1661 return;
1662 }
1663 if (req->get_op() != CEPH_MDS_OP_OPEN &&
1664 req->get_op() != CEPH_MDS_OP_CREATE) {
1665 dout(10) << " completed request which created new inode " << created
1666 << ", convert it to lookup request" << dendl;
1667 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
1668 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
1669 }
1670 }
1671 }
1672
1673 // trim completed_request list
1674 if (req->get_oldest_client_tid() > 0) {
1675 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
1676 assert(session);
1677 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
1678 // Session's 'completed_requests' was dirtied; mark it to be
1679 // potentially flushed at segment expiry.
1680 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
1681
1682 if (session->get_num_trim_requests_warnings() > 0 &&
1683 session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
1684 session->reset_num_trim_requests_warnings();
1685 } else {
1686 if (session->get_num_completed_requests() >=
1687 (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
1688 session->inc_num_trim_requests_warnings();
1689 stringstream ss;
1690 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
1691 << req->get_oldest_client_tid() << "), "
1692 << session->get_num_completed_requests()
1693 << " completed requests recorded in session\n";
1694 mds->clog->warn() << ss.str();
1695 dout(20) << __func__ << " " << ss.str() << dendl;
1696 }
1697 }
1698 }
1699
1700 // register + dispatch
1701 MDRequestRef mdr = mdcache->request_start(req);
1702 if (!mdr.get())
1703 return;
1704
1705 if (session) {
1706 mdr->session = session;
1707 session->requests.push_back(&mdr->item_session_request);
1708 }
1709
1710 if (has_completed)
1711 mdr->has_completed = true;
1712
1713 // process embedded cap releases?
1714 // (only if NOT replay!)
1715 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
1716 client_t client = req->get_source().num();
1717 for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
1718 p != req->releases.end();
1719 ++p)
1720 mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
1721 req->releases.clear();
1722 }
1723
1724 dispatch_client_request(mdr);
1725 return;
1726 }
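// Editor's note (not in the original source): the replay/retry handling above
// keys off Session::have_completed_request(); a completed CREATE/OPEN that
// minted an inode is downgraded to LOOKUP/GETATTR so the client still gets a
// full trace. A minimal standalone restatement of the trim-warning rule, with
// illustrative names only:
#if 0
static bool should_warn_about_tid_lag(unsigned completed, unsigned max_completed,
                                      unsigned prior_warnings) {
  // each warning doubles the threshold before the next one fires
  return completed >= (max_completed << prior_warnings);
}
#endif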
1727
1728 void Server::handle_osd_map()
1729 {
1730 /* Note that we check the OSDMAP_FULL flag directly rather than
1731 * using osdmap_full_flag(), because we want to know "is the flag set"
1732 * rather than "does the flag apply to us?" */
1733 mds->objecter->with_osdmap([this](const OSDMap& o) {
1734 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
1735 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
1736 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1737 << o.get_epoch() << dendl;
1738 });
1739 }
1740
1741 void Server::dispatch_client_request(MDRequestRef& mdr)
1742 {
1743 // we shouldn't be waiting on anyone.
1744 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1745
1746 if (mdr->killed) {
1747 dout(10) << "request " << *mdr << " was killed" << dendl;
1748 return;
1749 }
1750
1751 MClientRequest *req = mdr->client_request;
1752
1753 if (logger) logger->inc(l_mdss_dispatch_client_request);
1754
1755 dout(7) << "dispatch_client_request " << *req << dendl;
1756
1757 if (req->may_write()) {
1758 if (mdcache->is_readonly()) {
1759 dout(10) << " read-only FS" << dendl;
1760 respond_to_request(mdr, -EROFS);
1761 return;
1762 }
1763 if (mdr->has_more() && mdr->more()->slave_error) {
1764 dout(10) << " got error from slaves" << dendl;
1765 respond_to_request(mdr, mdr->more()->slave_error);
1766 return;
1767 }
1768 }
1769
1770 if (is_full) {
1771 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1772 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1774 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1775 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1776 req->get_op() == CEPH_MDS_OP_CREATE ||
1777 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1778 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1779 ((req->get_op() == CEPH_MDS_OP_LINK ||
1780 req->get_op() == CEPH_MDS_OP_RENAME) &&
1781 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1782 ) {
1783
1784 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1785 respond_to_request(mdr, -ENOSPC);
1786 return;
1787 } else {
1788 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1789 }
1790 }
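// Editor's note (not in the original source): the gate above fails ops that
// write new metadata fast with ENOSPC when the metadata pool is full, but lets
// a LINK/RENAME proceed once any slave MDS has witnessed it -- aborting at
// that point would strand a half-done multi-MDS operation. Schematically:
#if 0
static bool reject_when_full(bool writes_metadata, bool is_link_or_rename,
                             bool slaves_witnessed) {
  if (!writes_metadata)
    return false;                 // reads etc. always proceed
  if (is_link_or_rename && slaves_witnessed)
    return false;                 // too late to abort cleanly
  return true;                    // respond -ENOSPC
}
#endif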
1791
1792 switch (req->get_op()) {
1793 case CEPH_MDS_OP_LOOKUPHASH:
1794 case CEPH_MDS_OP_LOOKUPINO:
1795 handle_client_lookup_ino(mdr, false, false);
1796 break;
1797 case CEPH_MDS_OP_LOOKUPPARENT:
1798 handle_client_lookup_ino(mdr, true, false);
1799 break;
1800 case CEPH_MDS_OP_LOOKUPNAME:
1801 handle_client_lookup_ino(mdr, false, true);
1802 break;
1803
1804 // inodes ops.
1805 case CEPH_MDS_OP_LOOKUP:
1806 handle_client_getattr(mdr, true);
1807 break;
1808
1809 case CEPH_MDS_OP_LOOKUPSNAP:
1810 // lookupsnap does not reference a CDentry; treat it as a getattr
1811 case CEPH_MDS_OP_GETATTR:
1812 handle_client_getattr(mdr, false);
1813 break;
1814
1815 case CEPH_MDS_OP_SETATTR:
1816 handle_client_setattr(mdr);
1817 break;
1818 case CEPH_MDS_OP_SETLAYOUT:
1819 handle_client_setlayout(mdr);
1820 break;
1821 case CEPH_MDS_OP_SETDIRLAYOUT:
1822 handle_client_setdirlayout(mdr);
1823 break;
1824 case CEPH_MDS_OP_SETXATTR:
1825 handle_client_setxattr(mdr);
1826 break;
1827 case CEPH_MDS_OP_RMXATTR:
1828 handle_client_removexattr(mdr);
1829 break;
1830
1831 case CEPH_MDS_OP_READDIR:
1832 handle_client_readdir(mdr);
1833 break;
1834
1835 case CEPH_MDS_OP_SETFILELOCK:
1836 handle_client_file_setlock(mdr);
1837 break;
1838
1839 case CEPH_MDS_OP_GETFILELOCK:
1840 handle_client_file_readlock(mdr);
1841 break;
1842
1843 // funky.
1844 case CEPH_MDS_OP_CREATE:
1845 if (mdr->has_completed)
1846 handle_client_open(mdr); // already created.. just open
1847 else
1848 handle_client_openc(mdr);
1849 break;
1850
1851 case CEPH_MDS_OP_OPEN:
1852 handle_client_open(mdr);
1853 break;
1854
1855 // namespace.
1856 // no prior locks.
1857 case CEPH_MDS_OP_MKNOD:
1858 handle_client_mknod(mdr);
1859 break;
1860 case CEPH_MDS_OP_LINK:
1861 handle_client_link(mdr);
1862 break;
1863 case CEPH_MDS_OP_UNLINK:
1864 case CEPH_MDS_OP_RMDIR:
1865 handle_client_unlink(mdr);
1866 break;
1867 case CEPH_MDS_OP_RENAME:
1868 handle_client_rename(mdr);
1869 break;
1870 case CEPH_MDS_OP_MKDIR:
1871 handle_client_mkdir(mdr);
1872 break;
1873 case CEPH_MDS_OP_SYMLINK:
1874 handle_client_symlink(mdr);
1875 break;
1876
1877
1878 // snaps
1879 case CEPH_MDS_OP_LSSNAP:
1880 handle_client_lssnap(mdr);
1881 break;
1882 case CEPH_MDS_OP_MKSNAP:
1883 handle_client_mksnap(mdr);
1884 break;
1885 case CEPH_MDS_OP_RMSNAP:
1886 handle_client_rmsnap(mdr);
1887 break;
1888 case CEPH_MDS_OP_RENAMESNAP:
1889 handle_client_renamesnap(mdr);
1890 break;
1891
1892 default:
1893 dout(1) << " unknown client op " << req->get_op() << dendl;
1894 respond_to_request(mdr, -EOPNOTSUPP);
1895 }
1896 }
1897
1898
1899 // ---------------------------------------
1900 // SLAVE REQUESTS
1901
1902 /* This function DOES put the passed message before returning*/
1903 void Server::handle_slave_request(MMDSSlaveRequest *m)
1904 {
1905 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1906 mds_rank_t from = mds_rank_t(m->get_source().num());
1907
1908 if (logger) logger->inc(l_mdss_handle_slave_request);
1909
1910 // reply?
1911 if (m->is_reply())
1912 return handle_slave_request_reply(m);
1913
1914 // The purpose of rename notify is to enforce causal message ordering, i.e. to make
1915 // sure bystanders have received all messages from the rename srcdn's auth MDS.
1916 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1917 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1918 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1919 mds->send_message(reply, m->get_connection());
1920 m->put();
1921 return;
1922 }
1923
1924 CDentry *straydn = NULL;
1925 if (m->stray.length() > 0) {
1926 straydn = mdcache->add_replica_stray(m->stray, from);
1927 assert(straydn);
1928 m->stray.clear();
1929 }
1930
1931 // am i a new slave?
1932 MDRequestRef mdr;
1933 if (mdcache->have_request(m->get_reqid())) {
1934 // existing?
1935 mdr = mdcache->request_get(m->get_reqid());
1936
1937 // is my request newer?
1938 if (mdr->attempt > m->get_attempt()) {
1939 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
1940 << ", dropping " << *m << dendl;
1941 m->put();
1942 return;
1943 }
1944
1945
1946 if (mdr->attempt < m->get_attempt()) {
1947 // mine is old, close it out
1948 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
1949 << ", closing out" << dendl;
1950 mdcache->request_finish(mdr);
1951 mdr.reset();
1952 } else if (mdr->slave_to_mds != from) {
1953 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
1954 m->put();
1955 return;
1956 }
1957
1958 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
1959 mdr->aborted = true;
1960 if (mdr->slave_request) {
1961 // only abort an ongoing xlock, wrlock or auth pin
1962 assert(!mdr->slave_did_prepare());
1963 } else {
1964 mdcache->request_finish(mdr);
1965 }
1966 return;
1967 }
1968 }
1969 if (!mdr.get()) {
1970 // new?
1971 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
1972 dout(10) << "missing slave request for " << m->get_reqid()
1973 << " OP_FINISH, must have lost race with a forward" << dendl;
1974 m->put();
1975 return;
1976 }
1977 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
1978 mdr->set_op_stamp(m->op_stamp);
1979 }
1980 assert(mdr->slave_request == 0); // only one at a time, please!
1981
1982 if (straydn) {
1983 mdr->pin(straydn);
1984 mdr->straydn = straydn;
1985 }
1986
1987 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1988 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1989 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1990 return;
1991 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
1992 mdr->locks.empty()) {
1993 dout(3) << "not active yet, waiting" << dendl;
1994 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
1995 return;
1996 }
1997
1998 mdr->slave_request = m;
1999
2000 dispatch_slave_request(mdr);
2001 }
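// Editor's note (not in the original source): attempt numbers reconcile
// master/slave state after forwards and restarts. Schematically:
//   local attempt >  incoming attempt  -> drop the stale message
//   local attempt <  incoming attempt  -> finish the stale local request first
//   attempts equal                     -> message must come from the recorded slave_to_mds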
2002
2003 /* This function DOES put the passed message before returning*/
2004 void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
2005 {
2006 mds_rank_t from = mds_rank_t(m->get_source().num());
2007
2008 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2009 metareqid_t r = m->get_reqid();
2010 if (!mdcache->have_uncommitted_master(r, from)) {
2011 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2012 << from << " reqid " << r << dendl;
2013 m->put();
2014 return;
2015 }
2016 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2017 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2018 return;
2019 }
2020
2021 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2022 metareqid_t r = m->get_reqid();
2023 mdcache->committed_master_slave(r, from);
2024 m->put();
2025 return;
2026 }
2027
2028 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2029 if (m->get_attempt() != mdr->attempt) {
2030 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2031 << m->get_attempt() << dendl;
2032 m->put();
2033 return;
2034 }
2035
2036 switch (m->get_op()) {
2037 case MMDSSlaveRequest::OP_XLOCKACK:
2038 {
2039 // identify lock, master request
2040 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2041 m->get_object_info());
2042 mdr->more()->slaves.insert(from);
2043 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2044 mdr->xlocks.insert(lock);
2045 mdr->locks.insert(lock);
2046 mdr->finish_locking(lock);
2047 lock->get_xlock(mdr, mdr->get_client());
2048
2049 assert(mdr->more()->waiting_on_slave.count(from));
2050 mdr->more()->waiting_on_slave.erase(from);
2051 assert(mdr->more()->waiting_on_slave.empty());
2052 mdcache->dispatch_request(mdr);
2053 }
2054 break;
2055
2056 case MMDSSlaveRequest::OP_WRLOCKACK:
2057 {
2058 // identify lock, master request
2059 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2060 m->get_object_info());
2061 mdr->more()->slaves.insert(from);
2062 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2063 mdr->remote_wrlocks[lock] = from;
2064 mdr->locks.insert(lock);
2065 mdr->finish_locking(lock);
2066
2067 assert(mdr->more()->waiting_on_slave.count(from));
2068 mdr->more()->waiting_on_slave.erase(from);
2069 assert(mdr->more()->waiting_on_slave.empty());
2070 mdcache->dispatch_request(mdr);
2071 }
2072 break;
2073
2074 case MMDSSlaveRequest::OP_AUTHPINACK:
2075 handle_slave_auth_pin_ack(mdr, m);
2076 break;
2077
2078 case MMDSSlaveRequest::OP_LINKPREPACK:
2079 handle_slave_link_prep_ack(mdr, m);
2080 break;
2081
2082 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2083 handle_slave_rmdir_prep_ack(mdr, m);
2084 break;
2085
2086 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2087 handle_slave_rename_prep_ack(mdr, m);
2088 break;
2089
2090 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2091 handle_slave_rename_notify_ack(mdr, m);
2092 break;
2093
2094 default:
2095 ceph_abort();
2096 }
2097
2098 // done with reply.
2099 m->put();
2100 }
2101
2102 /* This function DOES put the mdr->slave_request before returning*/
2103 void Server::dispatch_slave_request(MDRequestRef& mdr)
2104 {
2105 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2106
2107 if (mdr->aborted) {
2108 dout(7) << " abort flag set, finishing" << dendl;
2109 mdcache->request_finish(mdr);
2110 return;
2111 }
2112
2113 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2114
2115 int op = mdr->slave_request->get_op();
2116 switch (op) {
2117 case MMDSSlaveRequest::OP_XLOCK:
2118 case MMDSSlaveRequest::OP_WRLOCK:
2119 {
2120 // identify object
2121 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2122 mdr->slave_request->get_object_info());
2123
2124 if (!lock) {
2125 dout(10) << "don't have object, dropping" << dendl;
2126 ceph_abort(); // can this happen if we auth pinned properly?
2127 }
2128 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2129 dout(10) << "not auth for remote xlock attempt, dropping on "
2130 << *lock << " on " << *lock->get_parent() << dendl;
2131 } else {
2132 // use acquire_locks so that we get auth_pinning.
2133 set<SimpleLock*> rdlocks;
2134 set<SimpleLock*> wrlocks = mdr->wrlocks;
2135 set<SimpleLock*> xlocks = mdr->xlocks;
2136
2137 int replycode = 0;
2138 switch (op) {
2139 case MMDSSlaveRequest::OP_XLOCK:
2140 xlocks.insert(lock);
2141 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2142 break;
2143 case MMDSSlaveRequest::OP_WRLOCK:
2144 wrlocks.insert(lock);
2145 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2146 break;
2147 }
2148
2149 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
2150 return;
2151
2152 // ack
2153 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
2154 r->set_lock_type(lock->get_type());
2155 lock->get_parent()->set_object_info(r->get_object_info());
2156 mds->send_message(r, mdr->slave_request->get_connection());
2157 }
2158
2159 // done.
2160 mdr->slave_request->put();
2161 mdr->slave_request = 0;
2162 }
2163 break;
2164
2165 case MMDSSlaveRequest::OP_UNXLOCK:
2166 case MMDSSlaveRequest::OP_UNWRLOCK:
2167 {
2168 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2169 mdr->slave_request->get_object_info());
2170 assert(lock);
2171 bool need_issue = false;
2172 switch (op) {
2173 case MMDSSlaveRequest::OP_UNXLOCK:
2174 mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
2175 break;
2176 case MMDSSlaveRequest::OP_UNWRLOCK:
2177 mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
2178 break;
2179 }
2180 if (need_issue)
2181 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2182
2183 // done. no ack necessary.
2184 mdr->slave_request->put();
2185 mdr->slave_request = 0;
2186 }
2187 break;
2188
2189 case MMDSSlaveRequest::OP_DROPLOCKS:
2190 mds->locker->drop_locks(mdr.get());
2191 mdr->slave_request->put();
2192 mdr->slave_request = 0;
2193 break;
2194
2195 case MMDSSlaveRequest::OP_AUTHPIN:
2196 handle_slave_auth_pin(mdr);
2197 break;
2198
2199 case MMDSSlaveRequest::OP_LINKPREP:
2200 case MMDSSlaveRequest::OP_UNLINKPREP:
2201 handle_slave_link_prep(mdr);
2202 break;
2203
2204 case MMDSSlaveRequest::OP_RMDIRPREP:
2205 handle_slave_rmdir_prep(mdr);
2206 break;
2207
2208 case MMDSSlaveRequest::OP_RENAMEPREP:
2209 handle_slave_rename_prep(mdr);
2210 break;
2211
2212 case MMDSSlaveRequest::OP_FINISH:
2213 // information about caps imported by the rename
2214 if (mdr->slave_request->inode_export.length() > 0)
2215 mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
2216 // finish off request.
2217 mdcache->request_finish(mdr);
2218 break;
2219
2220 default:
2221 ceph_abort();
2222 }
2223 }
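// Editor's note (not in the original source): the OP_XLOCK/OP_WRLOCK arms
// above implement the remote half of a two-MDS lock handshake. Schematically:
//
//   master                                   slave (lock auth)
//     OP_XLOCK(type, object)  ----------->   acquire_locks() via the Locker
//                             <-----------   OP_XLOCKACK(type, object)
//     record lock in mdr->xlocks/locks, resume dispatch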
2224
2225 /* This function DOES put the mdr->slave_request before returning*/
2226 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2227 {
2228 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2229
2230 // build list of objects
2231 list<MDSCacheObject*> objects;
2232 CInode *auth_pin_freeze = NULL;
2233 bool fail = false, wouldblock = false, readonly = false;
2234
2235 if (mdcache->is_readonly()) {
2236 dout(10) << " read-only FS" << dendl;
2237 readonly = true;
2238 fail = true;
2239 }
2240
2241 if (!fail) {
2242 for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
2243 p != mdr->slave_request->get_authpins().end();
2244 ++p) {
2245 MDSCacheObject *object = mdcache->get_object(*p);
2246 if (!object) {
2247 dout(10) << " don't have " << *p << dendl;
2248 fail = true;
2249 break;
2250 }
2251
2252 objects.push_back(object);
2253 if (*p == mdr->slave_request->get_authpin_freeze())
2254 auth_pin_freeze = static_cast<CInode*>(object);
2255 }
2256 }
2257
2258 // can we auth pin them?
2259 if (!fail) {
2260 for (list<MDSCacheObject*>::iterator p = objects.begin();
2261 p != objects.end();
2262 ++p) {
2263 if (!(*p)->is_auth()) {
2264 dout(10) << " not auth for " << **p << dendl;
2265 fail = true;
2266 break;
2267 }
2268 if (mdr->is_auth_pinned(*p))
2269 continue;
2270 if (!mdr->can_auth_pin(*p)) {
2271 if (mdr->slave_request->is_nonblock()) {
2272 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2273 fail = true;
2274 wouldblock = true;
2275 break;
2276 }
2277 // wait
2278 dout(10) << " waiting for authpinnable on " << **p << dendl;
2279 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2280 mdr->drop_local_auth_pins();
2281
2282 mds->locker->notify_freeze_waiter(*p);
2283 return;
2284 }
2285 }
2286 }
2287
2288 // auth pin!
2289 if (fail) {
2290 mdr->drop_local_auth_pins(); // just in case
2291 } else {
2292 /* the frozen auth pin is on the wrong inode; unfreeze it */
2293 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2294 mdr->more()->rename_inode != auth_pin_freeze)
2295 mdr->unfreeze_auth_pin(true);
2296
2297 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2298 * on the source inode to complete. This happens after all locks for the rename
2299 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2300 * parent objects first. So there is an ABBA deadlock if someone auth pins the source
2301 * inode after locks are acquired and before Server::handle_slave_rename_prep() is
2302 * called. The solution is to freeze the inode, preventing other MDRequests from
2303 * getting new auth pins.
2304 */
2305 if (auth_pin_freeze) {
2306 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2307 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2308 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2309 mds->mdlog->flush();
2310 return;
2311 }
2312 }
2313 for (list<MDSCacheObject*>::iterator p = objects.begin();
2314 p != objects.end();
2315 ++p) {
2316 dout(10) << "auth_pinning " << **p << dendl;
2317 mdr->auth_pin(*p);
2318 }
2319 }
2320
2321 // ack!
2322 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2323
2324 // return list of my auth_pins (if any)
2325 for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
2326 p != mdr->auth_pins.end();
2327 ++p) {
2328 MDSCacheObjectInfo info;
2329 (*p)->set_object_info(info);
2330 reply->get_authpins().push_back(info);
2331 if (*p == (MDSCacheObject*)auth_pin_freeze)
2332 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2333 }
2334
2335 if (wouldblock)
2336 reply->mark_error_wouldblock();
2337 if (readonly)
2338 reply->mark_error_rofs();
2339
2340 mds->send_message_mds(reply, mdr->slave_to_mds);
2341
2342 // clean up this request
2343 mdr->slave_request->put();
2344 mdr->slave_request = 0;
2345 return;
2346 }
2347
2348 /* This function DOES NOT put the passed ack before returning*/
2349 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
2350 {
2351 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2352 mds_rank_t from = mds_rank_t(ack->get_source().num());
2353
2354 // added auth pins?
2355 set<MDSCacheObject*> pinned;
2356 for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
2357 p != ack->get_authpins().end();
2358 ++p) {
2359 MDSCacheObject *object = mdcache->get_object(*p);
2360 assert(object); // we pinned it
2361 dout(10) << " remote has pinned " << *object << dendl;
2362 if (!mdr->is_auth_pinned(object))
2363 mdr->remote_auth_pins[object] = from;
2364 if (*p == ack->get_authpin_freeze())
2365 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2366 pinned.insert(object);
2367 }
2368
2369 // removed frozen auth pin?
2370 if (mdr->more()->is_remote_frozen_authpin &&
2371 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2372 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2373 assert(p != mdr->remote_auth_pins.end());
2374 if (p->second == from) {
2375 mdr->more()->is_remote_frozen_authpin = false;
2376 }
2377 }
2378
2379 // removed auth pins?
2380 map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
2381 while (p != mdr->remote_auth_pins.end()) {
2382 MDSCacheObject* object = p->first;
2383 if (p->second == from && pinned.count(object) == 0) {
2384 dout(10) << " remote has unpinned " << *object << dendl;
2385 mdr->remote_auth_pins.erase(p++);
2386 } else {
2387 ++p;
2388 }
2389 }
2390
2391 if (ack->is_error_rofs()) {
2392 mdr->more()->slave_error = -EROFS;
2393 mdr->aborted = true;
2394 } else if (ack->is_error_wouldblock()) {
2395 mdr->more()->slave_error = -EWOULDBLOCK;
2396 mdr->aborted = true;
2397 }
2398
2399 // note slave
2400 mdr->more()->slaves.insert(from);
2401
2402 // clear from waiting list
2403 assert(mdr->more()->waiting_on_slave.count(from));
2404 mdr->more()->waiting_on_slave.erase(from);
2405
2406 // go again?
2407 if (mdr->more()->waiting_on_slave.empty())
2408 mdcache->dispatch_request(mdr);
2409 else
2410 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2411 }
2412
2413
2414 // ---------------------------------------
2415 // HELPERS
2416
2417
2418 /**
2419 * check whether we are permitted to complete a request
2420 *
2421 * Check whether we have permission to perform the operation specified
2422 * by mask on the given inode, based on the capability in the mdr's
2423 * session.
2424 */
2425 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2426 {
2427 if (mdr->session) {
2428 int r = mdr->session->check_access(
2429 in, mask,
2430 mdr->client_request->get_caller_uid(),
2431 mdr->client_request->get_caller_gid(),
2432 &mdr->client_request->get_caller_gid_list(),
2433 mdr->client_request->head.args.setattr.uid,
2434 mdr->client_request->head.args.setattr.gid);
2435 if (r < 0) {
2436 respond_to_request(mdr, r);
2437 return false;
2438 }
2439 }
2440 return true;
2441 }
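// Editor's note (not in the original source): a false return means
// respond_to_request() has already replied, so callers just bail out --
// the usage pattern throughout this file:
#if 0
if (!check_access(mdr, cur, MAY_READ | MAY_WRITE))
  return;   // request already answered with the access error
#endif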
2442
2443 /**
2444 * check whether a fragment has reached its maximum size
2445 */
2447 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2448 {
2449 const auto size = in->get_frag_size();
2450 if (size >= g_conf->mds_bal_fragment_size_max) {
2451 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2452 respond_to_request(mdr, -ENOSPC);
2453 return false;
2454 }
2455
2456 return true;
2457 }
2458
2459
2460 /** validate_dentry_dir
2461 *
2462 * verify that the dir exists and would own the dname.
2463 * do not check if the dentry exists.
2464 */
2465 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
2466 {
2467 // make sure parent is a dir?
2468 if (!diri->is_dir()) {
2469 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2470 respond_to_request(mdr, -ENOTDIR);
2471 return NULL;
2472 }
2473
2474 // which dirfrag?
2475 frag_t fg = diri->pick_dirfrag(dname);
2476 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2477 if (!dir)
2478 return 0;
2479
2480 // frozen?
2481 if (dir->is_frozen()) {
2482 dout(7) << "dir is frozen " << *dir << dendl;
2483 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2484 return NULL;
2485 }
2486
2487 return dir;
2488 }
2489
2490
2491 /** prepare_null_dentry
2492 * prepare a null (or existing) dentry in given dir.
2493 * wait for any dn lock.
2494 */
2495 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
2496 {
2497 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2498 assert(dir->is_auth());
2499
2500 client_t client = mdr->get_client();
2501
2502 // does it already exist?
2503 CDentry *dn = dir->lookup(dname);
2504 if (dn) {
2505 /*
2506 if (dn->lock.is_xlocked_by_other(mdr)) {
2507 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2508 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2509 return 0;
2510 }
2511 */
2512 if (!dn->get_linkage(client, mdr)->is_null()) {
2513 // name already exists
2514 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
2515 if (!okexist) {
2516 respond_to_request(mdr, -EEXIST);
2517 return 0;
2518 }
2519 } else {
2520 dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
2521 }
2522
2523 return dn;
2524 }
2525
2526 // make sure dir is complete
2527 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2528 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2529 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2530 return 0;
2531 }
2532
2533 // create
2534 dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
2535 dn->mark_new();
2536 dout(10) << "prepare_null_dentry added " << *dn << dendl;
2537 return dn;
2538 }
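// Editor's note (not in the original source): the fetch guard above relies on
// the dirfrag's bloom filter -- an incomplete frag can still answer a negative
// lookup cheaply when the filter proves the name is absent. Restated:
#if 0
static bool must_fetch_dir(bool complete, bool has_bloom, bool name_in_bloom) {
  // fetch only if the frag is incomplete and the bloom filter cannot rule
  // the name out (no filter, or a possible hit)
  return !complete && (!has_bloom || name_in_bloom);
}
#endif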
2539
2540 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2541 {
2542 CDentry *straydn = mdr->straydn;
2543 if (straydn) {
2544 string straydname;
2545 in->name_stray_dentry(straydname);
2546 if (straydn->get_name() == straydname)
2547 return straydn;
2548
2549 assert(!mdr->done_locking);
2550 mdr->unpin(straydn);
2551 }
2552
2553 CDir *straydir = mdcache->get_stray_dir(in);
2554
2555 if (!mdr->client_request->is_replay() &&
2556 !check_fragment_space(mdr, straydir))
2557 return NULL;
2558
2559 straydn = mdcache->get_or_create_stray_dentry(in);
2560 mdr->straydn = straydn;
2561 mdr->pin(straydn);
2562 return straydn;
2563 }
2564
2565 /** prepare_new_inode
2566 *
2567 * create a new inode. set c/m/atime. hit dir pop.
2568 */
2569 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
2570 file_layout_t *layout)
2571 {
2572 CInode *in = new CInode(mdcache);
2573
2574 // Server::prepare_force_open_sessions() can re-open a session in the closing
2575 // state. In that corner case, the session's prealloc_inos are being freed.
2576 // To simplify the code, we disallow using/refilling the session's prealloc_inos
2577 // while the session is opening.
2578 bool allow_prealloc_inos = !mdr->session->is_opening();
2579
2580 // assign ino
2581 if (allow_prealloc_inos &&
2582 mdr->session->info.prealloc_inos.size()) {
2583 mdr->used_prealloc_ino =
2584 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
2585 mds->sessionmap.mark_projected(mdr->session);
2586
2587 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
2588 << " (" << mdr->session->info.prealloc_inos
2589 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
2590 << dendl;
2591 } else {
2592 mdr->alloc_ino =
2593 in->inode.ino = mds->inotable->project_alloc_id();
2594 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
2595 }
2596
2597 if (useino && useino != in->inode.ino) {
2598 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
2599 mds->clog->error() << mdr->client_request->get_source()
2600 << " specified ino " << useino
2601 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
2602 //ceph_abort(); // just for now.
2603 }
2604
2605 if (allow_prealloc_inos &&
2606 mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
2607 int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
2608 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
2609 assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
2610 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
2611 mds->sessionmap.mark_projected(mdr->session);
2612 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
2613 }
2614
2615 in->inode.version = 1;
2616 in->inode.xattr_version = 1;
2617 in->inode.nlink = 1; // FIXME
2618
2619 in->inode.mode = mode;
2620
2621 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
2622 if (in->inode.is_dir()) {
2623 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
2624 } else if (layout) {
2625 in->inode.layout = *layout;
2626 } else {
2627 in->inode.layout = mdcache->default_file_layout;
2628 }
2629
2630 in->inode.truncate_size = -1ull; // not truncated, yet!
2631 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
2632
2633 CInode *diri = dir->get_inode();
2634
2635 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
2636
2637 if (diri->inode.mode & S_ISGID) {
2638 dout(10) << " dir is sticky" << dendl;
2639 in->inode.gid = diri->inode.gid;
2640 if (S_ISDIR(mode)) {
2641 dout(10) << " new dir also sticky" << dendl;
2642 in->inode.mode |= S_ISGID;
2643 }
2644 } else
2645 in->inode.gid = mdr->client_request->get_caller_gid();
2646
2647 in->inode.uid = mdr->client_request->get_caller_uid();
2648
2649 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
2650 mdr->get_op_stamp();
2651
2652 in->inode.change_attr = 0;
2653
2654 MClientRequest *req = mdr->client_request;
2655 if (req->get_data().length()) {
2656 bufferlist::iterator p = req->get_data().begin();
2657
2658 // xattrs on new inode?
2659 map<string,bufferptr> xattrs;
2660 ::decode(xattrs, p);
2661 for (map<string,bufferptr>::iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
2662 dout(10) << "prepare_new_inode setting xattr " << p->first << dendl;
2663 in->xattrs[p->first] = p->second;
2664 }
2665 }
2666
2667 if (!mds->mdsmap->get_inline_data_enabled() ||
2668 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
2669 in->inode.inline_data.version = CEPH_INLINE_NONE;
2670
2671 mdcache->add_inode(in); // add
2672 dout(10) << "prepare_new_inode " << *in << dendl;
2673 return in;
2674 }
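// Editor's note (not in the original source): the gid/mode logic above follows
// the BSD setgid-directory convention. A self-contained restatement with
// hypothetical parameter names:
#if 0
#include <sys/stat.h>
static void inherit_group(unsigned dir_mode, unsigned dir_gid, unsigned caller_gid,
                          unsigned *new_mode, unsigned *new_gid) {
  if (dir_mode & S_ISGID) {
    *new_gid = dir_gid;            // new inode takes the directory's group
    if (S_ISDIR(*new_mode))
      *new_mode |= S_ISGID;        // a new subdirectory stays setgid
  } else {
    *new_gid = caller_gid;         // otherwise the caller's primary gid
  }
}
#endif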
2675
2676 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
2677 {
2678 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
2679 << " inotablev " << mds->inotable->get_projected_version()
2680 << dendl;
2681 blob->set_ino_alloc(mdr->alloc_ino,
2682 mdr->used_prealloc_ino,
2683 mdr->prealloc_inos,
2684 mdr->client_request->get_source(),
2685 mds->sessionmap.get_projected(),
2686 mds->inotable->get_projected_version());
2687 }
2688
2689 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2690 {
2691 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2692 << " / " << mdr->prealloc_inos
2693 << " / " << mdr->used_prealloc_ino << dendl;
2694
2695 if (mdr->alloc_ino) {
2696 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2697 }
2698 if (mdr->prealloc_inos.size()) {
2699 assert(session);
2700 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2701 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2702 mds->sessionmap.mark_dirty(session);
2703 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2704 }
2705 if (mdr->used_prealloc_ino) {
2706 assert(session);
2707 session->info.used_inos.erase(mdr->used_prealloc_ino);
2708 mds->sessionmap.mark_dirty(session);
2709 }
2710 }
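// Editor's note (not in the original source): preallocated inos move through
// two session-owned sets -- projected into pending_prealloc_inos at prepare
// time, then committed into info.prealloc_inos here once the journal entry is
// safe. A toy model with std::set standing in for interval_set:
#if 0
#include <cstdint>
#include <set>
static void commit_prealloc(std::set<uint64_t> &pending, std::set<uint64_t> &owned,
                            const std::set<uint64_t> &granted) {
  for (uint64_t ino : granted) {
    pending.erase(ino);   // no longer merely projected
    owned.insert(ino);    // durable: the client may consume these later
  }
}
#endif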
2711
2712 class C_MDS_TryFindInode : public ServerContext {
2713 MDRequestRef mdr;
2714 public:
2715 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2716 void finish(int r) override {
2717 if (r == -ESTALE) // :( find_ino_peers failed
2718 server->respond_to_request(mdr, r);
2719 else
2720 server->dispatch_client_request(mdr);
2721 }
2722 };
2723
2724 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2725 {
2726 // figure parent dir vs dname
2727 if (refpath.depth() == 0) {
2728 dout(7) << "can't do that to root" << dendl;
2729 respond_to_request(mdr, -EINVAL);
2730 return 0;
2731 }
2732 string dname = refpath.last_dentry();
2733 refpath.pop_dentry();
2734
2735 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2736
2737 // traverse to parent dir
2738 CInode *diri;
2739 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2740 if (r > 0) return 0; // delayed
2741 if (r < 0) {
2742 if (r == -ESTALE) {
2743 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2744 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2745 return 0;
2746 }
2747 respond_to_request(mdr, r);
2748 return 0;
2749 }
2750
2751 // is it an auth dir?
2752 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2753 if (!dir)
2754 return 0; // forwarded or waiting for freeze
2755
2756 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2757 return dir;
2758 }
2759
2760 /* If this returns null, the request has been handled
2761 * as appropriate: forwarded on, or the client's been replied to */
2762 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
2763 set<SimpleLock*> &rdlocks,
2764 bool want_auth,
2765 bool no_want_auth, /* for readdir, which doesn't want auth _even_if_ it's
2766 a snapped dir */
2767 file_layout_t **layout,
2768 bool no_lookup) // true if we cannot return a null dentry lease
2769 {
2770 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2771 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
2772
2773 if (mdr->done_locking)
2774 return mdr->in[n];
2775
2776 // traverse
2777 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
2778 if (r > 0)
2779 return NULL; // delayed
2780 if (r < 0) { // error
2781 if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
2782 if (!no_lookup)
2783 mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
2784 respond_to_request(mdr, r);
2785 } else if (r == -ESTALE) {
2786 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2787 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
2788 mdcache->find_ino_peers(refpath.get_ino(), c);
2789 } else {
2790 dout(10) << "FAIL on error " << r << dendl;
2791 respond_to_request(mdr, r);
2792 }
2793 return 0;
2794 }
2795 CInode *ref = mdr->in[n];
2796 dout(10) << "ref is " << *ref << dendl;
2797
2798 // fw to inode auth?
2799 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
2800 want_auth = true;
2801
2802 if (want_auth) {
2803 if (ref->is_ambiguous_auth()) {
2804 dout(10) << "waiting for single auth on " << *ref << dendl;
2805 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
2806 return 0;
2807 }
2808 if (!ref->is_auth()) {
2809 dout(10) << "fw to auth for " << *ref << dendl;
2810 mdcache->request_forward(mdr, ref->authority().first);
2811 return 0;
2812 }
2813
2814 // auth_pin?
2815 // do NOT proceed if freezing, as cap release may defer in that case, and
2816 // we could deadlock when we try to lock @ref.
2817 // if we're already auth_pinned, continue; the release has already been processed.
2818 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
2819 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
2820 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
2821 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2822 /* If we have any auth pins, this will deadlock.
2823 * But the only way to get here if we've already got auth pins
2824 * is because we're on an inode with snapshots that got updated
2825 * between dispatches of this request. So we're going to drop
2826 * our locks and our auth pins and reacquire them later.
2827 *
2828 * This is safe since we're only in this function when working on
2829 * a single MDS request; otherwise we'd be in
2830 * rdlock_path_xlock_dentry.
2831 */
2832 mds->locker->drop_locks(mdr.get(), NULL);
2833 mdr->drop_local_auth_pins();
2834 if (!mdr->remote_auth_pins.empty())
2835 mds->locker->notify_freeze_waiter(ref);
2836 return 0;
2837 }
2838
2839 mdr->auth_pin(ref);
2840 }
2841
2842 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2843 rdlocks.insert(&mdr->dn[n][i]->lock);
2844 if (layout)
2845 mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
2846 else
2847 mds->locker->include_snap_rdlocks(rdlocks, ref);
2848
2849 // set and pin ref
2850 mdr->pin(ref);
2851 return ref;
2852 }
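// Editor's note (not in the original source): NULL from this helper means the
// request was forwarded, parked behind a waiter, or already answered -- never
// a plain error. Callers therefore follow one shape:
#if 0
set<SimpleLock*> rdlocks;
CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true /*want_auth*/);
if (!cur)
  return;   // mdr has been taken care of
#endif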
2853
2854
2855 /** rdlock_path_xlock_dentry
2856 * traverse path to the directory that could/would contain dentry.
2857 * make sure i am auth for that dentry, forward as necessary.
2858 * create null dentry in place (or use existing if okexist).
2859 * get rdlocks on traversed dentries, xlock on new dentry.
2860 */
2861 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
2862 set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
2863 bool okexist, bool mustexist, bool alwaysxlock,
2864 file_layout_t **layout)
2865 {
2866 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2867
2868 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
2869
2870 client_t client = mdr->get_client();
2871
2872 if (mdr->done_locking)
2873 return mdr->dn[n].back();
2874
2875 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
2876 if (!dir) return 0;
2877
2878 CInode *diri = dir->get_inode();
2879 if (!mdr->reqid.name.is_mds()) {
2880 if (diri->is_system() && !diri->is_root()) {
2881 respond_to_request(mdr, -EROFS);
2882 return 0;
2883 }
2884 }
2885 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
2886 respond_to_request(mdr, -ENOENT);
2887 return 0;
2888 }
2889
2890 // make a null dentry?
2891 const string &dname = refpath.last_dentry();
2892 CDentry *dn;
2893 if (mustexist) {
2894 dn = dir->lookup(dname);
2895
2896 // make sure dir is complete
2897 if (!dn && !dir->is_complete() &&
2898 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2899 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2900 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2901 return 0;
2902 }
2903
2904 // readable?
2905 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
2906 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2907 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2908 return 0;
2909 }
2910
2911 // exists?
2912 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
2913 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
2914 respond_to_request(mdr, -ENOENT);
2915 return 0;
2916 }
2917 } else {
2918 dn = prepare_null_dentry(mdr, dir, dname, okexist);
2919 if (!dn)
2920 return 0;
2921 }
2922
2923 mdr->dn[n].push_back(dn);
2924 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
2925 mdr->in[n] = dnl->get_inode();
2926
2927 // -- lock --
2928 // NOTE: rename takes the same set of locks for srcdn
2929 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2930 rdlocks.insert(&mdr->dn[n][i]->lock);
2931 if (alwaysxlock || dnl->is_null())
2932 xlocks.insert(&dn->lock); // new dn, xlock
2933 else
2934 rdlocks.insert(&dn->lock); // existing dn, rdlock
2935 wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
2936 wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir mtime
2937 if (layout)
2938 mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
2939 else
2940 mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);
2941
2942 return dn;
2943 }
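// Editor's note (not in the original source): callers collect the three lock
// sets and then take everything in one acquire_locks() call, so the Locker can
// impose a global ordering and avoid deadlock (cf. handle_client_openc below;
// schematic only, trailing layout argument omitted):
#if 0
set<SimpleLock*> rdlocks, wrlocks, xlocks;
CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
                                       false /*okexist*/, false /*mustexist*/,
                                       false /*alwaysxlock*/);
if (!dn)
  return;
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
  return;   // waiting on a lock; request will be retried
#endif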
2944
2945
2946
2947
2948
2949 /**
2950 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2951 *
2952 * @param diri base inode
2953 * @param fg the exact frag we want
2954 * @param mdr request
2955 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2956 */
2957 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2958 {
2959 CDir *dir = diri->get_dirfrag(fg);
2960
2961 // not open and inode not mine?
2962 if (!dir && !diri->is_auth()) {
2963 mds_rank_t inauth = diri->authority().first;
2964 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2965 mdcache->request_forward(mdr, inauth);
2966 return 0;
2967 }
2968
2969 // not open and inode frozen?
2970 if (!dir && diri->is_frozen()) {
2971 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2972 assert(diri->get_parent_dir());
2973 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2974 return 0;
2975 }
2976
2977 // invent?
2978 if (!dir)
2979 dir = diri->get_or_open_dirfrag(mdcache, fg);
2980
2981 // am i auth for the dirfrag?
2982 if (!dir->is_auth()) {
2983 mds_rank_t auth = dir->authority().first;
2984 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2985 << ", fw to mds." << auth << dendl;
2986 mdcache->request_forward(mdr, auth);
2987 return 0;
2988 }
2989
2990 return dir;
2991 }
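// Editor's note (not in the original source): typical use, as in
// validate_dentry_dir() above -- NULL means forwarded or queued, so return:
#if 0
frag_t fg = diri->pick_dirfrag(dname);
CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
if (!dir)
  return 0;
#endif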
2992
2993
2994 // ===============================================================================
2995 // STAT
2996
2997 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
2998 {
2999 MClientRequest *req = mdr->client_request;
3000 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3001
3002 if (req->get_filepath().depth() == 0 && is_lookup) {
3003 // refpath can't be empty for lookup but it can for
3004 // getattr (we do getattr with empty refpath for mount of '/')
3005 respond_to_request(mdr, -EINVAL);
3006 return;
3007 }
3008
3009 CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
3010 if (!ref) return;
3011
3012 /*
3013 * if client currently holds the EXCL cap on a field, do not rdlock
3014 * it; client's stat() will result in valid info if _either_ EXCL
3015 * cap is held or MDS rdlocks and reads the value here.
3016 *
3017 * handling this case here is easier than weakening rdlock
3018 * semantics... that would cause problems elsewhere.
3019 */
3020 client_t client = mdr->get_client();
3021 int issued = 0;
3022 Capability *cap = ref->get_client_cap(client);
3023 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3024 mdr->snapid <= cap->client_follows))
3025 issued = cap->issued();
3026
3027 int mask = req->head.args.getattr.mask;
3028 if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
3029 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
3030 if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
3031 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);
3032
3033 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3034 return;
3035
3036 if (!check_access(mdr, ref, MAY_READ))
3037 return;
3038
3039 // note which caps are requested, so we return at least a snapshot
3040 // value for them. (currently this matters for xattrs and inline data)
3041 mdr->getattr_caps = mask;
3042
3043 mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
3044 req->get_source().num());
3045
3046 // reply
3047 dout(10) << "reply to stat on " << *req << dendl;
3048 mdr->tracei = ref;
3049 if (is_lookup)
3050 mdr->tracedn = mdr->dn[0].back();
3051 respond_to_request(mdr, 0);
3052 }
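// Editor's note (not in the original source): the mask handling above encodes
// one rule -- rdlock a field only if the client wants its SHARED cap and no
// client holds the matching EXCL cap (EXCL contents are authoritative on the
// client side). Restated standalone:
#if 0
static bool need_rdlock_for(unsigned mask, unsigned issued,
                            unsigned shared_bit, unsigned excl_bit) {
  return (mask & shared_bit) && (issued & excl_bit) == 0;
}
#endif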
3053
3054 struct C_MDS_LookupIno2 : public ServerContext {
3055 MDRequestRef mdr;
3056 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3057 void finish(int r) override {
3058 server->_lookup_ino_2(mdr, r);
3059 }
3060 };
3061
3062 /* This function DOES clean up the mdr before returning*/
3063 /*
3064 * filepath: ino
3065 */
3066 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3067 bool want_parent, bool want_dentry)
3068 {
3069 MClientRequest *req = mdr->client_request;
3070
3071 inodeno_t ino = req->get_filepath().get_ino();
3072 CInode *in = mdcache->get_inode(ino);
3073 if (in && in->state_test(CInode::STATE_PURGING)) {
3074 respond_to_request(mdr, -ESTALE);
3075 return;
3076 }
3077 if (!in) {
3078 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3079 return;
3080 }
3081
3082 if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
3083 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3084 return;
3085 }
3086
3087 // check for nothing (not read or write); this still applies the
3088 // path check.
3089 if (!check_access(mdr, in, 0))
3090 return;
3091
3092 CDentry *dn = in->get_projected_parent_dn();
3093 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3094
3095 set<SimpleLock*> rdlocks;
3096 if (dn && (want_parent || want_dentry)) {
3097 mdr->pin(dn);
3098 rdlocks.insert(&dn->lock);
3099 }
3100
3101 unsigned mask = req->head.args.getattr.mask;
3102 if (mask) {
3103 Capability *cap = in->get_client_cap(mdr->get_client());
3104 int issued = 0;
3105 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3106 issued = cap->issued();
3107 // permission bits, ACL/security xattrs
3108 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3109 rdlocks.insert(&in->authlock);
3110 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3111 rdlocks.insert(&in->xattrlock);
3112
3113 mdr->getattr_caps = mask;
3114 }
3115
3116 if (!rdlocks.empty()) {
3117 set<SimpleLock*> wrlocks, xlocks;
3118 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3119 return;
3120
3121 if (diri != NULL) {
3122 // need read access to directory inode
3123 if (!check_access(mdr, diri, MAY_READ))
3124 return;
3125 }
3126 }
3127
3128 if (want_parent) {
3129 if (in->is_base()) {
3130 respond_to_request(mdr, -EINVAL);
3131 return;
3132 }
3133 if (!diri || diri->is_stray()) {
3134 respond_to_request(mdr, -ESTALE);
3135 return;
3136 }
3137 dout(10) << "reply to lookup_parent " << *in << dendl;
3138 mdr->tracei = diri;
3139 respond_to_request(mdr, 0);
3140 } else {
3141 if (want_dentry) {
3142 inodeno_t dirino = req->get_filepath2().get_ino();
3143 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3144 respond_to_request(mdr, -ENOENT);
3145 return;
3146 }
3147 dout(10) << "reply to lookup_name " << *in << dendl;
3148 } else
3149 dout(10) << "reply to lookup_ino " << *in << dendl;
3150
3151 mdr->tracei = in;
3152 if (want_dentry)
3153 mdr->tracedn = dn;
3154 respond_to_request(mdr, 0);
3155 }
3156 }
3157
3158 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3159 {
3160 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3161 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3162
3163 // `r` is a rank if >=0, else an error code
3164 if (r >= 0) {
3165 mds_rank_t dest_rank(r);
3166 if (dest_rank == mds->get_nodeid())
3167 dispatch_client_request(mdr);
3168 else
3169 mdcache->request_forward(mdr, dest_rank);
3170 return;
3171 }
3172
3173 // give up
3174 if (r == -ENOENT || r == -ENODATA)
3175 r = -ESTALE;
3176 respond_to_request(mdr, r);
3177 }
3178
3179
3180 /* This function takes responsibility for the passed mdr*/
3181 void Server::handle_client_open(MDRequestRef& mdr)
3182 {
3183 MClientRequest *req = mdr->client_request;
3184 dout(7) << "open on " << req->get_filepath() << dendl;
3185
3186 int flags = req->head.args.open.flags;
3187 int cmode = ceph_flags_to_mode(flags);
3188 if (cmode < 0) {
3189 respond_to_request(mdr, -EINVAL);
3190 return;
3191 }
3192
3193 bool need_auth = !file_mode_is_readonly(cmode) ||
3194 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3195
3196 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3197 dout(7) << "read-only FS" << dendl;
3198 respond_to_request(mdr, -EROFS);
3199 return;
3200 }
3201
3202 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3203 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3204 if (!cur)
3205 return;
3206
3207 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3208 assert(!need_auth);
3209 mdr->done_locking = false;
3210 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); // re-take the ref with auth; don't shadow the outer 'cur'
3211 if (!cur)
3212 return;
3213 }
3214
3215 if (!cur->inode.is_file()) {
3216 // can only open a non-regular inode with mode FILE_MODE_PIN, at least for now.
3217 cmode = CEPH_FILE_MODE_PIN;
3218 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
3219 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3220 flags &= ~CEPH_O_TRUNC;
3221 }
3222
3223 dout(10) << "open flags = " << flags
3224 << ", filemode = " << cmode
3225 << ", need_auth = " << need_auth
3226 << dendl;
3227
3228 // regular file?
3229 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3230 dout(7) << "not a file or dir " << *cur << dendl;
3231 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3232 return;
3233 }*/
3234 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3235 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3236 respond_to_request(mdr, -EINVAL);
3237 return;
3238 }
3239
3240 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3241 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3242 // return -EISDIR for a directory, and -EINVAL for other non-regular inodes
3243 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3244 return;
3245 }
3246
3247 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3248 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3249 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3250 respond_to_request(mdr, -EPERM);
3251 return;
3252 }
3253
3254 // snapped data is read only
3255 if (mdr->snapid != CEPH_NOSNAP &&
3256 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3257 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3258 respond_to_request(mdr, -EROFS);
3259 return;
3260 }
3261
3262 unsigned mask = req->head.args.open.mask;
3263 if (mask) {
3264 Capability *cap = cur->get_client_cap(mdr->get_client());
3265 int issued = 0;
3266 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3267 issued = cap->issued();
3268 // permission bits, ACL/security xattrs
3269 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3270 rdlocks.insert(&cur->authlock);
3271 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3272 rdlocks.insert(&cur->xattrlock);
3273
3274 mdr->getattr_caps = mask;
3275 }
3276
3277 // O_TRUNC
3278 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3279 assert(cur->is_auth());
3280
3281 xlocks.insert(&cur->filelock);
3282 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3283 return;
3284
3285 if (!check_access(mdr, cur, MAY_WRITE))
3286 return;
3287
3288 // wait for pending truncate?
3289 const inode_t *pi = cur->get_projected_inode();
3290 if (pi->is_truncating()) {
3291 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3292 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3293 mds->locker->drop_locks(mdr.get());
3294 mdr->drop_local_auth_pins();
3295 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3296 return;
3297 }
3298
3299 do_open_truncate(mdr, cmode);
3300 return;
3301 }
3302
3303 // sync filelock if snapped.
3304 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3305 // and that data itself is flushed so that we can read the snapped data off disk.
3306 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3307 rdlocks.insert(&cur->filelock);
3308 }
3309
3310 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3311 return;
3312
3313 mask = MAY_READ;
3314 if (cmode & CEPH_FILE_MODE_WR)
3315 mask |= MAY_WRITE;
3316 if (!check_access(mdr, cur, mask))
3317 return;
3318
3319 if (cur->is_file() || cur->is_dir()) {
3320 if (mdr->snapid == CEPH_NOSNAP) {
3321 // register new cap
3322 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3323 if (cap)
3324 dout(12) << "open issued caps " << ccap_string(cap->pending())
3325 << " for " << req->get_source()
3326 << " on " << *cur << dendl;
3327 } else {
3328 int caps = ceph_caps_for_mode(cmode);
3329 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3330 << " for " << req->get_source()
3331 << " snapid " << mdr->snapid
3332 << " on " << *cur << dendl;
3333 mdr->snap_caps = caps;
3334 }
3335 }
3336
3337 // increase max_size?
3338 if (cmode & CEPH_FILE_MODE_WR)
3339 mds->locker->check_inode_max_size(cur);
3340
3341 // make sure this inode gets into the journal
3342 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3343 !cur->item_open_file.is_on_list()) {
3344 LogSegment *ls = mds->mdlog->get_current_segment();
3345 EOpen *le = new EOpen(mds->mdlog);
3346 mdlog->start_entry(le);
3347 le->add_clean_inode(cur);
3348 ls->open_files.push_back(&cur->item_open_file);
3349 mdlog->submit_entry(le);
3350 }
3351
3352 // hit pop
3353 if (cmode & CEPH_FILE_MODE_WR)
3354 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
3355 else
3356 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
3357 mdr->client_request->get_source().num());
3358
3359 CDentry *dn = 0;
3360 if (req->get_dentry_wanted()) {
3361 assert(mdr->dn[0].size());
3362 dn = mdr->dn[0].back();
3363 }
3364
3365 mdr->tracei = cur;
3366 mdr->tracedn = dn;
3367 respond_to_request(mdr, 0);
3368 }
3369
3370 class C_MDS_openc_finish : public ServerLogContext {
3371 CDentry *dn;
3372 CInode *newi;
3373 snapid_t follows;
3374 public:
3375 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
3376 ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
3377 void finish(int r) override {
3378 assert(r == 0);
3379
3380 dn->pop_projected_linkage();
3381
3382 // dirty inode, dn, dir
3383 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3384 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3385 newi->_mark_dirty_parent(mdr->ls, true);
3386
3387 mdr->apply();
3388
3389 get_mds()->locker->share_inode_max_size(newi);
3390
3391 MDRequestRef null_ref;
3392 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3393
3394 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
3395
3396 server->respond_to_request(mdr, 0);
3397
3398 assert(g_conf->mds_kill_openc_at != 1);
3399 }
3400 };
3401
3402 /* This function takes responsibility for the passed mdr*/
3403 void Server::handle_client_openc(MDRequestRef& mdr)
3404 {
3405 MClientRequest *req = mdr->client_request;
3406 client_t client = mdr->get_client();
3407
3408 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
3409
3410 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
3411 if (cmode < 0) {
3412 respond_to_request(mdr, -EINVAL);
3413 return;
3414 }
3415
3416 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
3417
3418 if (!excl) {
3419 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
3420 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
3421 if (r > 0) return;
3422 if (r == 0) {
3423 // it existed.
3424 handle_client_open(mdr);
3425 return;
3426 }
3427 if (r < 0 && r != -ENOENT) {
3428 if (r == -ESTALE) {
3429 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3430 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
3431 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
3432 } else {
3433 dout(10) << "FAIL on error " << r << dendl;
3434 respond_to_request(mdr, r);
3435 }
3436 return;
3437 }
3438 }
3439
3440 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3441 file_layout_t *dir_layout = NULL;
3442 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
3443 !excl, false, false, &dir_layout);
3444 if (!dn) return;
3445 if (mdr->snapid != CEPH_NOSNAP) {
3446 respond_to_request(mdr, -EROFS);
3447 return;
3448 }
3449 // set layout
3450 file_layout_t layout;
3451 if (dir_layout)
3452 layout = *dir_layout;
3453 else
3454 layout = mdcache->default_file_layout;
3455
3456 // What kind of client caps are required to complete this operation
3457 uint64_t access = MAY_WRITE;
3458
3459 const auto default_layout = layout;
3460
3461 // fill in any special params from client
3462 if (req->head.args.open.stripe_unit)
3463 layout.stripe_unit = req->head.args.open.stripe_unit;
3464 if (req->head.args.open.stripe_count)
3465 layout.stripe_count = req->head.args.open.stripe_count;
3466 if (req->head.args.open.object_size)
3467 layout.object_size = req->head.args.open.object_size;
3468 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
3469 (__s32)req->head.args.open.pool >= 0) {
3470 layout.pool_id = req->head.args.open.pool;
3471
3472 // make sure we have as new a map as the client
3473 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
3474 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
3475 return;
3476 }
3477 }
3478
3479 // If client doesn't have capability to modify layout pools, then
3480 // only permit this request if the requested pool matches what the
3481 // file would have inherited anyway from its parent.
3482 if (default_layout != layout) {
3483 access |= MAY_SET_VXATTR;
3484 }
3485
3486 if (!layout.is_valid()) {
3487 dout(10) << " invalid initial file layout" << dendl;
3488 respond_to_request(mdr, -EINVAL);
3489 return;
3490 }
3491 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
3492 dout(10) << " invalid data pool " << layout.pool_id << dendl;
3493 respond_to_request(mdr, -EINVAL);
3494 return;
3495 }
3496
3497 // created null dn.
3498 CDir *dir = dn->get_dir();
3499 CInode *diri = dir->get_inode();
3500 rdlocks.insert(&diri->authlock);
3501 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3502 return;
3503
3504 if (!check_access(mdr, diri, access))
3505 return;
3506
3507 if (!check_fragment_space(mdr, dir))
3508 return;
3509
3510 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3511
3512 if (!dnl->is_null()) {
3513 // it existed.
3514 assert(req->head.args.open.flags & CEPH_O_EXCL);
3515 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
3516 mdr->tracei = dnl->get_inode();
3517 mdr->tracedn = dn;
3518 respond_to_request(mdr, -EEXIST);
3519 return;
3520 }
3521
3522 // create inode.
3523 SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3524 snapid_t follows = realm->get_newest_seq();
3525
3526 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
3527 req->head.args.open.mode | S_IFREG, &layout);
3528 assert(in);
3529
3530 // it's a file.
3531 dn->push_projected_linkage(in);
3532
3533 in->inode.version = dn->pre_dirty();
3534 if (layout.pool_id != mdcache->default_file_layout.pool_id)
3535 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
3536 in->inode.update_backtrace();
3537 if (cmode & CEPH_FILE_MODE_WR) {
3538 in->inode.client_ranges[client].range.first = 0;
3539 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
3540 in->inode.client_ranges[client].follows = follows;
3541 }
3542 in->inode.rstat.rfiles = 1;
3543
3544 assert(dn->first == follows+1);
3545 in->first = dn->first;
3546
3547 // prepare finisher
3548 mdr->ls = mdlog->get_current_segment();
3549 EUpdate *le = new EUpdate(mdlog, "openc");
3550 mdlog->start_entry(le);
3551 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
3552 journal_allocated_inos(mdr, &le->metablob);
3553 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
3554 le->metablob.add_primary_dentry(dn, in, true, true, true);
3555
3556 // do the open
3557 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
3558 in->authlock.set_state(LOCK_EXCL);
3559 in->xattrlock.set_state(LOCK_EXCL);
3560
3561 // make sure this inode gets into the journal
3562 le->metablob.add_opened_ino(in->ino());
3563 LogSegment *ls = mds->mdlog->get_current_segment();
3564 ls->open_files.push_back(&in->item_open_file);
3565
3566 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
3567
3568 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
3569 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
3570 // add the file-created flag to the reply if the client supports the reply-create-inode feature
3571 ::encode(in->inode.ino, mdr->reply_extra_bl);
3572 }
3573
3574 journal_and_reply(mdr, in, dn, le, fin);
3575
3576 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3577 // have overshot the split size (multiple opencs in flight), so here is
3578 // an early chance to split the dir if this openc makes it oversized.
3579 mds->balancer->maybe_fragment(dir, false);
3580 }
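
// A minimal sketch of how a client reaches handle_client_openc(), assuming
// libcephfs and a hypothetical path. With O_EXCL the projected-linkage check
// above turns an existing target into -EEXIST; without O_EXCL an existing
// file is simply handed off to handle_client_open():
//
//   int fd = ceph_open(cmount, "/dir/newfile",
//                      O_CREAT | O_EXCL | O_WRONLY, 0644);
//   // fd < 0 (EEXIST) if /dir/newfile already exists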
3581
3582
3583
3584 void Server::handle_client_readdir(MDRequestRef& mdr)
3585 {
3586 MClientRequest *req = mdr->client_request;
3587 client_t client = req->get_source().num();
3588 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3589 CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
3590 if (!diri) return;
3591
3592 // it's a directory, right?
3593 if (!diri->is_dir()) {
3594 // not a dir
3595 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
3596 respond_to_request(mdr, -ENOTDIR);
3597 return;
3598 }
3599
3600 rdlocks.insert(&diri->filelock);
3601 rdlocks.insert(&diri->dirfragtreelock);
3602
3603 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3604 return;
3605
3606 if (!check_access(mdr, diri, MAY_READ))
3607 return;
3608
3609 // which frag?
3610 frag_t fg = (__u32)req->head.args.readdir.frag;
3611 unsigned req_flags = (__u32)req->head.args.readdir.flags;
3612 string offset_str = req->get_path2();
3613
3614 __u32 offset_hash = 0;
3615 if (!offset_str.empty())
3616 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
3617 else
3618 offset_hash = (__u32)req->head.args.readdir.offset_hash;
3619
3620 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
3621 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
3622
3623 // does the frag exist?
3624 if (diri->dirfragtree[fg.value()] != fg) {
3625 frag_t newfg;
3626 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3627 if (fg.contains((unsigned)offset_hash)) {
3628 newfg = diri->dirfragtree[offset_hash];
3629 } else {
3630 // client actually wants next frag
3631 newfg = diri->dirfragtree[fg.value()];
3632 }
3633 } else {
3634 offset_str.clear();
3635 newfg = diri->dirfragtree[fg.value()];
3636 }
3637 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
3638 fg = newfg;
3639 }
3640
3641 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
3642 if (!dir) return;
3643
3644 // ok!
3645 dout(10) << "handle_client_readdir on " << *dir << dendl;
3646 assert(dir->is_auth());
3647
3648 if (!dir->is_complete()) {
3649 if (dir->is_frozen()) {
3650 dout(7) << "dir is frozen " << *dir << dendl;
3651 mds->locker->drop_locks(mdr.get());
3652 mdr->drop_local_auth_pins();
3653 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3654 return;
3655 }
3656 // fetch
3657 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
3658 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3659 return;
3660 }
3661
3662 #ifdef MDS_VERIFY_FRAGSTAT
3663 dir->verify_fragstat();
3664 #endif
3665
3666 utime_t now = ceph_clock_now();
3667 mdr->set_mds_stamp(now);
3668
3669 snapid_t snapid = mdr->snapid;
3670 dout(10) << "snapid " << snapid << dendl;
3671
3672 SnapRealm *realm = diri->find_snaprealm();
3673
3674 unsigned max = req->head.args.readdir.max_entries;
3675 if (!max)
3676 max = dir->get_num_any(); // whatever, something big.
3677 unsigned max_bytes = req->head.args.readdir.max_bytes;
3678 if (!max_bytes)
3679 // make sure at least one item can be encoded
3680 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
3681
3682 // start final blob
3683 bufferlist dirbl;
3684 dir->encode_dirstat(dirbl, mds->get_nodeid());
3685
3686 // count bytes available.
3687 // this isn't perfect, but we should capture the main variable/unbounded size items!
3688 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
3689 int bytes_left = max_bytes - front_bytes;
3690 bytes_left -= realm->get_snap_trace().length();
3691
3692 // build dir contents
3693 bufferlist dnbl;
3694 __u32 numfiles = 0;
3695 bool start = !offset_hash && offset_str.empty();
3696 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3697 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
3698 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
3699 bool end = (it == dir->end());
3700 for (; !end && numfiles < max; end = (it == dir->end())) {
3701 CDentry *dn = it->second;
3702 ++it;
3703
3704 if (dn->state_test(CDentry::STATE_PURGING))
3705 continue;
3706
3707 bool dnp = dn->use_projected(client, mdr);
3708 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
3709
3710 if (dnl->is_null())
3711 continue;
3712
3713 if (dn->last < snapid || dn->first > snapid) {
3714 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
3715 continue;
3716 }
3717
3718 if (!start) {
3719 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
3720 if (!(offset_key < dn->key()))
3721 continue;
3722 }
3723
3724 CInode *in = dnl->get_inode();
3725
3726 if (in && in->ino() == CEPH_INO_CEPH)
3727 continue;
3728
3729 // remote link?
3730 // better for the MDS to do the work, if we think the client will stat any of these files.
3731 if (dnl->is_remote() && !in) {
3732 in = mdcache->get_inode(dnl->get_remote_ino());
3733 if (in) {
3734 dn->link_remote(dnl, in);
3735 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
3736 dout(10) << "skipping bad remote ino on " << *dn << dendl;
3737 continue;
3738 } else {
3739 // touch everything I _do_ have
3740 for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
3741 if (!p->second->get_linkage()->is_null())
3742 mdcache->lru.lru_touch(p->second);
3743
3744 // already issued caps and leases, reply immediately.
3745 if (dnbl.length() > 0) {
3746 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
3747 dout(10) << " open remote dentry after caps were issued, stopping at "
3748 << dnbl.length() << " < " << bytes_left << dendl;
3749 break;
3750 }
3751
3752 mds->locker->drop_locks(mdr.get());
3753 mdr->drop_local_auth_pins();
3754 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
3755 return;
3756 }
3757 }
3758 assert(in);
3759
3760 if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
3761 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
3762 break;
3763 }
3764
3765 unsigned start_len = dnbl.length();
3766
3767 // dentry
3768 dout(12) << "including dn " << *dn << dendl;
3769 ::encode(dn->name, dnbl);
3770 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
3771
3772 // inode
3773 dout(12) << "including inode " << *in << dendl;
3774 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
3775 if (r < 0) {
3776 // chop off dn->name, lease
3777 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
3778 bufferlist keep;
3779 keep.substr_of(dnbl, 0, start_len);
3780 dnbl.swap(keep);
3781 break;
3782 }
3783 assert(r >= 0);
3784 numfiles++;
3785
3786 // touch dn
3787 mdcache->lru.lru_touch(dn);
3788 }
3789
3790 __u16 flags = 0;
3791 if (end) {
3792 flags = CEPH_READDIR_FRAG_END;
3793 if (start)
3794 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
3795 }
3796 // do clients only understand the END and COMPLETE flags?
3797 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3798 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
3799 }
3800
3801 // finish final blob
3802 ::encode(numfiles, dirbl);
3803 ::encode(flags, dirbl);
3804 dirbl.claim_append(dnbl);
3805
3806 // yay, reply
3807 dout(10) << "reply to " << *req << " readdir num=" << numfiles
3808 << " bytes=" << dirbl.length()
3809 << " start=" << (int)start
3810 << " end=" << (int)end
3811 << dendl;
3812 mdr->reply_extra_bl = dirbl;
3813
3814 // bump popularity. NOTE: this doesn't quite capture it.
3815 mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);
3816
3817 // reply
3818 mdr->tracei = diri;
3819 respond_to_request(mdr, 0);
3820 }
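
// A hedged sketch of the client loop this handler serves, assuming libcephfs
// (error handling omitted, hypothetical path). Large directories come back
// one dirfrag chunk at a time; the client resumes from the last
// offset_str/offset_hash until it sees CEPH_READDIR_FRAG_END:
//
//   struct ceph_dir_result *dirp;
//   if (ceph_opendir(cmount, "/big/dir", &dirp) == 0) {
//     struct dirent *de;
//     while ((de = ceph_readdir(cmount, dirp)) != NULL)
//       printf("%s\n", de->d_name);   // each chunk encoded by the code above
//     ceph_closedir(cmount, dirp);
//   }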
3821
3822
3823
3824 // ===============================================================================
3825 // INODE UPDATES
3826
3827
3828 /*
3829 * finisher for basic inode updates
3830 */
3831 class C_MDS_inode_update_finish : public ServerLogContext {
3832 CInode *in;
3833 bool truncating_smaller, changed_ranges;
3834 public:
3835 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
3836 bool sm=false, bool cr=false) :
3837 ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
3838 void finish(int r) override {
3839 assert(r == 0);
3840
3841 // apply
3842 in->pop_and_dirty_projected_inode(mdr->ls);
3843 mdr->apply();
3844
3845 // notify any clients
3846 if (truncating_smaller && in->inode.is_truncating()) {
3847 get_mds()->locker->issue_truncate(in);
3848 get_mds()->mdcache->truncate_inode(in, mdr->ls);
3849 }
3850
3851 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
3852
3853 server->respond_to_request(mdr, 0);
3854
3855 if (changed_ranges)
3856 get_mds()->locker->share_inode_max_size(in);
3857 }
3858 };
3859
3860 void Server::handle_client_file_setlock(MDRequestRef& mdr)
3861 {
3862 MClientRequest *req = mdr->client_request;
3863 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3864
3865 // get the inode to operate on, and set up any locks needed for that
3866 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3867 if (!cur)
3868 return;
3869
3870 xlocks.insert(&cur->flocklock);
3871 /* acquire_locks will return true if it gets the locks. If it fails,
3872 it will redeliver this request at a later date, so drop the request.
3873 */
3874 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3875 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3876 return;
3877 }
3878
3879 // copy the lock change into a ceph_filelock so we can store/apply it
3880 ceph_filelock set_lock;
3881 set_lock.start = req->head.args.filelock_change.start;
3882 set_lock.length = req->head.args.filelock_change.length;
3883 set_lock.client = req->get_orig_source().num();
3884 set_lock.owner = req->head.args.filelock_change.owner;
3885 set_lock.pid = req->head.args.filelock_change.pid;
3886 set_lock.type = req->head.args.filelock_change.type;
3887 bool will_wait = req->head.args.filelock_change.wait;
3888
3889 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3890
3891 ceph_lock_state_t *lock_state = NULL;
3892 bool interrupt = false;
3893
3894 // get the appropriate lock state
3895 switch (req->head.args.filelock_change.rule) {
3896 case CEPH_LOCK_FLOCK_INTR:
3897 interrupt = true;
3898 // fall-thru
3899 case CEPH_LOCK_FLOCK:
3900 lock_state = cur->get_flock_lock_state();
3901 break;
3902
3903 case CEPH_LOCK_FCNTL_INTR:
3904 interrupt = true;
3905 // fall-thru
3906 case CEPH_LOCK_FCNTL:
3907 lock_state = cur->get_fcntl_lock_state();
3908 break;
3909
3910 default:
3911 dout(10) << "got unknown lock type " << set_lock.type
3912 << ", dropping request!" << dendl;
3913 respond_to_request(mdr, -EOPNOTSUPP);
3914 return;
3915 }
3916
3917 dout(10) << " state prior to lock change: " << *lock_state << dendl;
3918 if (CEPH_LOCK_UNLOCK == set_lock.type) {
3919 list<ceph_filelock> activated_locks;
3920 list<MDSInternalContextBase*> waiters;
3921 if (lock_state->is_waiting(set_lock)) {
3922 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
3923 lock_state->remove_waiting(set_lock);
3924 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3925 } else if (!interrupt) {
3926 dout(10) << " unlock attempt on " << set_lock << dendl;
3927 lock_state->remove_lock(set_lock, activated_locks);
3928 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3929 }
3930 mds->queue_waiters(waiters);
3931
3932 respond_to_request(mdr, 0);
3933 } else {
3934 dout(10) << " lock attempt on " << set_lock << dendl;
3935 bool deadlock = false;
3936 if (mdr->more()->flock_was_waiting &&
3937 !lock_state->is_waiting(set_lock)) {
3938 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
3939 respond_to_request(mdr, -EINTR);
3940 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
3941 dout(10) << " it failed on this attempt" << dendl;
3942 // couldn't set lock right now
3943 if (deadlock) {
3944 respond_to_request(mdr, -EDEADLK);
3945 } else if (!will_wait) {
3946 respond_to_request(mdr, -EWOULDBLOCK);
3947 } else {
3948 dout(10) << " added to waiting list" << dendl;
3949 assert(lock_state->is_waiting(set_lock));
3950 mdr->more()->flock_was_waiting = true;
3951 mds->locker->drop_locks(mdr.get());
3952 mdr->drop_local_auth_pins();
3953 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
3954 }
3955 } else
3956 respond_to_request(mdr, 0);
3957 }
3958 dout(10) << " state after lock change: " << *lock_state << dendl;
3959 }
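
// The setlock handler above is what services POSIX advisory locks from
// CephFS clients. A sketch using plain fcntl(2) (nothing Ceph-specific;
// F_SETLKW corresponds to wait=1, so the request may park on the waiting
// list handled above):
//
//   struct flock fl;
//   memset(&fl, 0, sizeof(fl));
//   fl.l_type = F_WRLCK;            // exclusive lock
//   fl.l_whence = SEEK_SET;
//   fl.l_start = 0;
//   fl.l_len = 0;                   // length 0 = whole file
//   int r = fcntl(fd, F_SETLKW, &fl);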
3960
3961 void Server::handle_client_file_readlock(MDRequestRef& mdr)
3962 {
3963 MClientRequest *req = mdr->client_request;
3964 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3965
3966 // get the inode to operate on, and set up any locks needed for that
3967 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3968 if (!cur)
3969 return;
3970
3971 /* acquire_locks will return true if it gets the locks. If it fails,
3972 it will redeliver this request at a later date, so drop the request.
3973 */
3974 rdlocks.insert(&cur->flocklock);
3975 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3976 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
3977 return;
3978 }
3979
3980 // copy the lock change into a ceph_filelock so we can store/apply it
3981 ceph_filelock checking_lock;
3982 checking_lock.start = req->head.args.filelock_change.start;
3983 checking_lock.length = req->head.args.filelock_change.length;
3984 checking_lock.client = req->get_orig_source().num();
3985 checking_lock.owner = req->head.args.filelock_change.owner;
3986 checking_lock.pid = req->head.args.filelock_change.pid;
3987 checking_lock.type = req->head.args.filelock_change.type;
3988
3989 // get the appropriate lock state
3990 ceph_lock_state_t *lock_state = NULL;
3991 switch (req->head.args.filelock_change.rule) {
3992 case CEPH_LOCK_FLOCK:
3993 lock_state = cur->get_flock_lock_state();
3994 break;
3995
3996 case CEPH_LOCK_FCNTL:
3997 lock_state = cur->get_fcntl_lock_state();
3998 break;
3999
4000 default:
4001 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4002 respond_to_request(mdr, -EINVAL);
4003 return;
4004 }
4005 lock_state->look_for_lock(checking_lock);
4006
4007 bufferlist lock_bl;
4008 ::encode(checking_lock, lock_bl);
4009
4010 mdr->reply_extra_bl = lock_bl;
4011 respond_to_request(mdr, 0);
4012 }
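
// The readlock handler answers F_GETLK-style queries: it never blocks, it
// only reports a conflicting lock (if any) via look_for_lock(). A sketch
// with plain fcntl(2):
//
//   struct flock fl;
//   memset(&fl, 0, sizeof(fl));
//   fl.l_type = F_WRLCK;            // "would a write lock succeed?"
//   int r = fcntl(fd, F_GETLK, &fl);
//   // on return fl describes the conflicting holder, or
//   // fl.l_type == F_UNLCK if the lock could be taken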
4013
4014 void Server::handle_client_setattr(MDRequestRef& mdr)
4015 {
4016 MClientRequest *req = mdr->client_request;
4017 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4018 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4019 if (!cur) return;
4020
4021 if (mdr->snapid != CEPH_NOSNAP) {
4022 respond_to_request(mdr, -EROFS);
4023 return;
4024 }
4025 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4026 respond_to_request(mdr, -EPERM);
4027 return;
4028 }
4029
4030 __u32 mask = req->head.args.setattr.mask;
4031 __u32 access_mask = MAY_WRITE;
4032
4033 // xlock inode
4034 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4035 xlocks.insert(&cur->authlock);
4036 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4037 xlocks.insert(&cur->filelock);
4038 if (mask & CEPH_SETATTR_CTIME)
4039 wrlocks.insert(&cur->versionlock);
4040
4041 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4042 return;
4043
4044 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4045 access_mask |= MAY_CHOWN;
4046
4047 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4048 access_mask |= MAY_CHGRP;
4049
4050 if (!check_access(mdr, cur, access_mask))
4051 return;
4052
4053 // trunc from bigger -> smaller?
4054 inode_t *pi = cur->get_projected_inode();
4055
4056 uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);
4057
4058 // ENOSPC on growing file while full, but allow shrinks
4059 if (is_full && req->head.args.setattr.size > old_size) {
4060 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4061 respond_to_request(mdr, -ENOSPC);
4062 return;
4063 }
4064
4065 bool truncating_smaller = false;
4066 if (mask & CEPH_SETATTR_SIZE) {
4067 truncating_smaller = req->head.args.setattr.size < old_size;
4068 if (truncating_smaller && pi->is_truncating()) {
4069 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4070 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4071 mds->locker->drop_locks(mdr.get());
4072 mdr->drop_local_auth_pins();
4073 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4074 return;
4075 }
4076 }
4077
4078 bool changed_ranges = false;
4079
4080 // project update
4081 mdr->ls = mdlog->get_current_segment();
4082 EUpdate *le = new EUpdate(mdlog, "setattr");
4083 mdlog->start_entry(le);
4084
4085 pi = cur->project_inode();
4086
4087 if (mask & CEPH_SETATTR_UID)
4088 pi->uid = req->head.args.setattr.uid;
4089 if (mask & CEPH_SETATTR_GID)
4090 pi->gid = req->head.args.setattr.gid;
4091
4092 if (mask & CEPH_SETATTR_MODE)
4093 pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
4094 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4095 S_ISREG(pi->mode) &&
4096 (pi->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4097 pi->mode &= ~(S_ISUID|S_ISGID);
4098 }
4099
4100 if (mask & CEPH_SETATTR_MTIME)
4101 pi->mtime = req->head.args.setattr.mtime;
4102 if (mask & CEPH_SETATTR_ATIME)
4103 pi->atime = req->head.args.setattr.atime;
4104 if (mask & CEPH_SETATTR_BTIME)
4105 pi->btime = req->head.args.setattr.btime;
4106 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4107 pi->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4108 if (mask & CEPH_SETATTR_SIZE) {
4109 if (truncating_smaller) {
4110 pi->truncate(old_size, req->head.args.setattr.size);
4111 le->metablob.add_truncate_start(cur->ino());
4112 } else {
4113 pi->size = req->head.args.setattr.size;
4114 pi->rstat.rbytes = pi->size;
4115 }
4116 pi->mtime = mdr->get_op_stamp();
4117
4118 // adjust client's max_size?
4119 map<client_t,client_writeable_range_t> new_ranges;
4120 bool max_increased = false;
4121 mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
4122 if (pi->client_ranges != new_ranges) {
4123 dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
4124 pi->client_ranges = new_ranges;
4125 changed_ranges = true;
4126 }
4127 }
4128
4129 pi->version = cur->pre_dirty();
4130 pi->ctime = mdr->get_op_stamp();
4131 pi->change_attr++;
4132
4133 // log + wait
4134 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4135 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4136 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4137
4138 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4139 truncating_smaller, changed_ranges));
4140
4141 // flush immediately if there are readers/writers waiting
4142 if (xlocks.count(&cur->filelock) &&
4143 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4144 mds->mdlog->flush();
4145 }
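
// A shrinking truncate is the interesting setattr case above: the size
// change is journalled with add_truncate_start() and the object trimming
// finishes asynchronously via truncate_inode() in the finisher. A sketch of
// the triggering calls (plain POSIX, hypothetical mount path):
//
//   truncate("/mnt/cephfs/file", 4096);  // smaller: truncating_smaller=true
//   ftruncate(fd, 1 << 30);              // larger: only pi->size/rstat move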
4146
4147 /* Takes responsibility for mdr */
4148 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4149 {
4150 CInode *in = mdr->in[0];
4151 client_t client = mdr->get_client();
4152 assert(in);
4153
4154 dout(10) << "do_open_truncate " << *in << dendl;
4155
4156 SnapRealm *realm = in->find_snaprealm();
4157 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4158
4159 mdr->ls = mdlog->get_current_segment();
4160 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4161 mdlog->start_entry(le);
4162
4163 // prepare
4164 inode_t *pi = in->project_inode();
4165 pi->version = in->pre_dirty();
4166 pi->mtime = pi->ctime = mdr->get_op_stamp();
4167 pi->change_attr++;
4168
4169 uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
4170 if (old_size > 0) {
4171 pi->truncate(old_size, 0);
4172 le->metablob.add_truncate_start(in->ino());
4173 }
4174
4175 bool changed_ranges = false;
4176 if (cmode & CEPH_FILE_MODE_WR) {
4177 pi->client_ranges[client].range.first = 0;
4178 pi->client_ranges[client].range.last = pi->get_layout_size_increment();
4179 pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
4180 changed_ranges = true;
4181 }
4182
4183 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4184
4185 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4186 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4187
4188 // make sure ino gets into the journal
4189 le->metablob.add_opened_ino(in->ino());
4190 LogSegment *ls = mds->mdlog->get_current_segment();
4191 ls->open_files.push_back(&in->item_open_file);
4192
4193 mdr->o_trunc = true;
4194
4195 CDentry *dn = 0;
4196 if (mdr->client_request->get_dentry_wanted()) {
4197 assert(mdr->dn[0].size());
4198 dn = mdr->dn[0].back();
4199 }
4200
4201 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4202 changed_ranges));
4203 // Although the `open` part can give an early reply, the truncation won't
4204 // happen until our EUpdate is persistent; to give the client a prompt
4205 // response we must also flush that event.
4206 mdlog->flush();
4207 }
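
// do_open_truncate() is reached when an open carries O_TRUNC; a sketch of
// the triggering call (plain POSIX, hypothetical path):
//
//   int fd = open("/mnt/cephfs/file", O_WRONLY | O_TRUNC);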
4208
4209
4210 /* This function cleans up the passed mdr */
4211 void Server::handle_client_setlayout(MDRequestRef& mdr)
4212 {
4213 MClientRequest *req = mdr->client_request;
4214 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4215 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4216 if (!cur) return;
4217
4218 if (mdr->snapid != CEPH_NOSNAP) {
4219 respond_to_request(mdr, -EROFS);
4220 return;
4221 }
4222 if (!cur->is_file()) {
4223 respond_to_request(mdr, -EINVAL);
4224 return;
4225 }
4226 if (cur->get_projected_inode()->size ||
4227 cur->get_projected_inode()->truncate_seq > 1) {
4228 respond_to_request(mdr, -ENOTEMPTY);
4229 return;
4230 }
4231
4232 // validate layout
4233 file_layout_t layout = cur->get_projected_inode()->layout;
4234 // save existing layout for later
4235 const auto old_layout = layout;
4236
4237 int access = MAY_WRITE;
4238
4239 if (req->head.args.setlayout.layout.fl_object_size > 0)
4240 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4241 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4242 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4243 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4244 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4245 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4246 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4247
4248 // make sure we have as new a map as the client
4249 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4250 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4251 return;
4252 }
4253 }
4254
4255 // Don't permit layout modifications without 'p' caps
4256 if (layout != old_layout) {
4257 access |= MAY_SET_VXATTR;
4258 }
4259
4260 if (!layout.is_valid()) {
4261 dout(10) << "bad layout" << dendl;
4262 respond_to_request(mdr, -EINVAL);
4263 return;
4264 }
4265 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4266 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4267 respond_to_request(mdr, -EINVAL);
4268 return;
4269 }
4270
4271 xlocks.insert(&cur->filelock);
4272 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4273 return;
4274
4275 if (!check_access(mdr, cur, access))
4276 return;
4277
4278 // project update
4279 inode_t *pi = cur->project_inode();
4280 pi->layout = layout;
4281 // add the old pool to the inode
4282 pi->add_old_pool(old_layout.pool_id);
4283 pi->version = cur->pre_dirty();
4284 pi->ctime = mdr->get_op_stamp();
4285 pi->change_attr++;
4286
4287 // log + wait
4288 mdr->ls = mdlog->get_current_segment();
4289 EUpdate *le = new EUpdate(mdlog, "setlayout");
4290 mdlog->start_entry(le);
4291 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4292 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4293 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4294
4295 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4296 }
4297
4298 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4299 {
4300 MClientRequest *req = mdr->client_request;
4301 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4302 file_layout_t *dir_layout = NULL;
4303 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4304 if (!cur) return;
4305
4306 if (mdr->snapid != CEPH_NOSNAP) {
4307 respond_to_request(mdr, -EROFS);
4308 return;
4309 }
4310
4311 if (!cur->is_dir()) {
4312 respond_to_request(mdr, -ENOTDIR);
4313 return;
4314 }
4315
4316 xlocks.insert(&cur->policylock);
4317 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4318 return;
4319
4320 // validate layout
4321 const inode_t *old_pi = cur->get_projected_inode();
4322 file_layout_t layout;
4323 if (old_pi->has_layout())
4324 layout = old_pi->layout;
4325 else if (dir_layout)
4326 layout = *dir_layout;
4327 else
4328 layout = mdcache->default_file_layout;
4329
4330 // Level of access required to complete
4331 int access = MAY_WRITE;
4332
4333 const auto old_layout = layout;
4334
4335 if (req->head.args.setlayout.layout.fl_object_size > 0)
4336 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4337 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4338 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4339 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4340 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4341 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4342 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4343 // make sure we have as new a map as the client
4344 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4345 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4346 return;
4347 }
4348 }
4349
4350 if (layout != old_layout) {
4351 access |= MAY_SET_VXATTR;
4352 }
4353
4354 if (!layout.is_valid()) {
4355 dout(10) << "bad layout" << dendl;
4356 respond_to_request(mdr, -EINVAL);
4357 return;
4358 }
4359 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4360 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4361 respond_to_request(mdr, -EINVAL);
4362 return;
4363 }
4364
4365 if (!check_access(mdr, cur, access))
4366 return;
4367
4368 inode_t *pi = cur->project_inode();
4369 pi->layout = layout;
4370 pi->version = cur->pre_dirty();
4371
4372 // log + wait
4373 mdr->ls = mdlog->get_current_segment();
4374 EUpdate *le = new EUpdate(mdlog, "setlayout");
4375 mdlog->start_entry(le);
4376 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4377 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4378 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4379
4380 mdr->no_early_reply = true;
4381 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4382 }
4383
4384 // XATTRS
4385
4386 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4387 file_layout_t *layout, bool validate)
4388 {
4389 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4390 try {
4391 if (name == "layout") {
4392 string::iterator begin = value.begin();
4393 string::iterator end = value.end();
4394 keys_and_values<string::iterator> p; // create instance of parser
4395 std::map<string, string> m; // map to receive results
4396 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4397 return -EINVAL;
4398 }
4399 string left(begin, end);
4400 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4401 if (begin != end)
4402 return -EINVAL;
4403 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4404 // Skip validation on each attr, we do it once at the end (avoid
4405 // rejecting intermediate states if the overall result is ok)
4406 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4407 osdmap, layout, false);
4408 if (r < 0)
4409 return r;
4410 }
4411 } else if (name == "layout.object_size") {
4412 layout->object_size = boost::lexical_cast<unsigned>(value);
4413 } else if (name == "layout.stripe_unit") {
4414 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4415 } else if (name == "layout.stripe_count") {
4416 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4417 } else if (name == "layout.pool") {
4418 try {
4419 layout->pool_id = boost::lexical_cast<unsigned>(value);
4420 } catch (boost::bad_lexical_cast const&) {
4421 int64_t pool = osdmap.lookup_pg_pool_name(value);
4422 if (pool < 0) {
4423 dout(10) << " unknown pool " << value << dendl;
4424 return -ENOENT;
4425 }
4426 layout->pool_id = pool;
4427 }
4428 } else if (name == "layout.pool_namespace") {
4429 layout->pool_ns = value;
4430 } else {
4431 dout(10) << " unknown layout vxattr " << name << dendl;
4432 return -EINVAL;
4433 }
4434 } catch (boost::bad_lexical_cast const&) {
4435 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4436 return -EINVAL;
4437 }
4438
4439 if (validate && !layout->is_valid()) {
4440 dout(10) << "bad layout" << dendl;
4441 return -EINVAL;
4442 }
4443 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4444 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4445 return -EINVAL;
4446 }
4447 return 0;
4448 }
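
// parse_layout_vxattr() backs the "ceph.{file,dir}.layout" virtual xattrs.
// An illustrative sketch using setxattr(2) on a kernel-mounted client
// (hypothetical paths; example values; the combined form is parsed by the
// keys_and_values grammar above):
//
//   setxattr("/mnt/cephfs/somefile", "ceph.file.layout.stripe_unit",
//            "1048576", 7, 0);
//   const char *v =
//     "stripe_unit=1048576 stripe_count=4 object_size=4194304 pool=cephfs_data";
//   setxattr("/mnt/cephfs/somedir", "ceph.dir.layout", v, strlen(v), 0);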
4449
4450 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4451 {
4452 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4453 try {
4454 if (name == "quota") {
4455 string::iterator begin = value.begin();
4456 string::iterator end = value.end();
4457 keys_and_values<string::iterator> p; // create instance of parser
4458 std::map<string, string> m; // map to receive results
4459 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4460 return -EINVAL;
4461 }
4462 string left(begin, end);
4463 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4464 if (begin != end)
4465 return -EINVAL;
4466 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4467 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4468 if (r < 0)
4469 return r;
4470 }
4471 } else if (name == "quota.max_bytes") {
4472 int64_t q = boost::lexical_cast<int64_t>(value);
4473 if (q < 0)
4474 return -EINVAL;
4475 quota->max_bytes = q;
4476 } else if (name == "quota.max_files") {
4477 int64_t q = boost::lexical_cast<int64_t>(value);
4478 if (q < 0)
4479 return -EINVAL;
4480 quota->max_files = q;
4481 } else {
4482 dout(10) << " unknown quota vxattr " << name << dendl;
4483 return -EINVAL;
4484 }
4485 } catch (boost::bad_lexical_cast const&) {
4486 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4487 return -EINVAL;
4488 }
4489
4490 if (!quota->is_valid()) {
4491 dout(10) << "bad quota" << dendl;
4492 return -EINVAL;
4493 }
4494 return 0;
4495 }
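
// parse_quota_vxattr() backs the directory quota vxattrs. An illustrative
// sketch using setxattr(2) (hypothetical path; example values; 0 means
// "no quota"):
//
//   const char *q = "107374182400";                             // 100 GiB
//   setxattr("/mnt/cephfs/dir", "ceph.quota.max_bytes", q, strlen(q), 0);
//   setxattr("/mnt/cephfs/dir", "ceph.quota.max_files", "10000", 5, 0);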
4496
4497 /*
4498 * Verify that the file layout attribute carried by client
4499 * is well-formatted.
4500 * Return 0 on success, otherwise this function takes
4501 * responsibility for the passed mdr.
4502 */
4503 int Server::check_layout_vxattr(MDRequestRef& mdr,
4504 string name,
4505 string value,
4506 file_layout_t *layout)
4507 {
4508 MClientRequest *req = mdr->client_request;
4509 epoch_t epoch;
4510 int r;
4511
4512 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4513 r = parse_layout_vxattr(name, value, osdmap, layout);
4514 epoch = osdmap.get_epoch();
4515 });
4516
4517 if (r == -ENOENT) {
4518
4519 // we don't have the specified pool, make sure our map
4520 // is newer than or as new as the client.
4521 epoch_t req_epoch = req->get_osdmap_epoch();
4522
4523 if (req_epoch > epoch) {
4524
4525 // well, our map is older. consult mds.
4526 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4527
4528 if (!mds->objecter->wait_for_map(req_epoch, fin))
4529 return r; // wait, fin will retry this request later
4530
4531 delete fin;
4532
4533 // now we have at least as new a map as the client, try again.
4534 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4535 r = parse_layout_vxattr(name, value, osdmap, layout);
4536 epoch = osdmap.get_epoch();
4537 });
4538
4539 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4540
4541 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4542
4543 // For compatibility with clients running old code, we still need to
4544 // get the latest map. One day, once the COMPACT_VERSION of
4545 // MClientRequest is >= 3, we can remove this code.
4546 mdr->waited_for_osdmap = true;
4547 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4548 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4549 return r;
4550 }
4551 }
4552
4553 if (r < 0) {
4554
4555 if (r == -ENOENT)
4556 r = -EINVAL;
4557
4558 respond_to_request(mdr, r);
4559 return r;
4560 }
4561
4562 // all is well
4563 return 0;
4564 }
4565
4566 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
4567 file_layout_t *dir_layout,
4568 set<SimpleLock*> rdlocks,
4569 set<SimpleLock*> wrlocks,
4570 set<SimpleLock*> xlocks)
4571 {
4572 MClientRequest *req = mdr->client_request;
4573 string name(req->get_path2());
4574 bufferlist bl = req->get_data();
4575 string value (bl.c_str(), bl.length());
4576 dout(10) << "handle_set_vxattr " << name
4577 << " val " << value.length()
4578 << " bytes on " << *cur
4579 << dendl;
4580
4581 inode_t *pi = NULL;
4582 string rest;
4583
4584 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
4585 return;
4586 }
4587
4588 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
4589 if (!cur->is_dir()) {
4590 respond_to_request(mdr, -EINVAL);
4591 return;
4592 }
4593
4594 file_layout_t layout;
4595 if (cur->get_projected_inode()->has_layout())
4596 layout = cur->get_projected_inode()->layout;
4597 else if (dir_layout)
4598 layout = *dir_layout;
4599 else
4600 layout = mdcache->default_file_layout;
4601
4602 rest = name.substr(name.find("layout"));
4603 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4604 return;
4605
4606 xlocks.insert(&cur->policylock);
4607 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4608 return;
4609
4610 pi = cur->project_inode();
4611 pi->layout = layout;
4612 mdr->no_early_reply = true;
4613 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
4614 if (!cur->is_file()) {
4615 respond_to_request(mdr, -EINVAL);
4616 return;
4617 }
4618 if (cur->get_projected_inode()->size ||
4619 cur->get_projected_inode()->truncate_seq > 1) {
4620 respond_to_request(mdr, -ENOTEMPTY);
4621 return;
4622 }
4623 file_layout_t layout = cur->get_projected_inode()->layout;
4624 rest = name.substr(name.find("layout"));
4625 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4626 return;
4627
4628 xlocks.insert(&cur->filelock);
4629 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4630 return;
4631
4632 pi = cur->project_inode();
4633 int64_t old_pool = pi->layout.pool_id;
4634 pi->add_old_pool(old_pool);
4635 pi->layout = layout;
4636 pi->ctime = mdr->get_op_stamp();
4637 } else if (name.compare(0, 10, "ceph.quota") == 0) {
4638 if (!cur->is_dir() || cur->is_root()) {
4639 respond_to_request(mdr, -EINVAL);
4640 return;
4641 }
4642
4643 quota_info_t quota = cur->get_projected_inode()->quota;
4644
4645 rest = name.substr(name.find("quota"));
4646 int r = parse_quota_vxattr(rest, value, &quota);
4647 if (r < 0) {
4648 respond_to_request(mdr, r);
4649 return;
4650 }
4651
4652 xlocks.insert(&cur->policylock);
4653 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4654 return;
4655
4656 pi = cur->project_inode();
4657 pi->quota = quota;
4658 mdr->no_early_reply = true;
4659 } else if (name.find("ceph.dir.pin") == 0) {
4660 if (!cur->is_dir() || cur->is_root()) {
4661 respond_to_request(mdr, -EINVAL);
4662 return;
4663 }
4664
4665 mds_rank_t rank;
4666 try {
4667 rank = boost::lexical_cast<mds_rank_t>(value);
4668 if (rank < 0) rank = MDS_RANK_NONE;
4669 } catch (boost::bad_lexical_cast const&) {
4670 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4671 respond_to_request(mdr, -EINVAL);
4672 return;
4673 }
4674
4675 xlocks.insert(&cur->policylock);
4676 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4677 return;
4678
4679 pi = cur->project_inode();
4680 cur->set_export_pin(rank);
4681 } else {
4682 dout(10) << " unknown vxattr " << name << dendl;
4683 respond_to_request(mdr, -EINVAL);
4684 return;
4685 }
4686
4687 pi->change_attr++;
4688 pi->ctime = mdr->get_op_stamp();
4689 pi->version = cur->pre_dirty();
4690 if (cur->is_file())
4691 pi->update_backtrace();
4692
4693 // log + wait
4694 mdr->ls = mdlog->get_current_segment();
4695 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
4696 mdlog->start_entry(le);
4697 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4698 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4699 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4700
4701 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4702 return;
4703 }
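
// The "ceph.dir.pin" branch above drives subtree export pinning. An
// illustrative sketch using setxattr(2) (hypothetical path; ranks are
// examples; negative values map to MDS_RANK_NONE as parsed above):
//
//   setxattr("/mnt/cephfs/dir", "ceph.dir.pin", "1", 1, 0);   // pin to rank 1
//   setxattr("/mnt/cephfs/dir", "ceph.dir.pin", "-1", 2, 0);  // unpin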
4704
4705 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
4706 file_layout_t *dir_layout,
4707 set<SimpleLock*> rdlocks,
4708 set<SimpleLock*> wrlocks,
4709 set<SimpleLock*> xlocks)
4710 {
4711 MClientRequest *req = mdr->client_request;
4712 string name(req->get_path2());
4713
4714 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
4715
4716 if (name == "ceph.dir.layout") {
4717 if (!cur->is_dir()) {
4718 respond_to_request(mdr, -ENODATA);
4719 return;
4720 }
4721 if (cur->is_root()) {
4722 dout(10) << "can't remove layout policy on the root directory" << dendl;
4723 respond_to_request(mdr, -EINVAL);
4724 return;
4725 }
4726
4727 if (!cur->get_projected_inode()->has_layout()) {
4728 respond_to_request(mdr, -ENODATA);
4729 return;
4730 }
4731
4732 xlocks.insert(&cur->policylock);
4733 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4734 return;
4735
4736 inode_t *pi = cur->project_inode();
4737 pi->clear_layout();
4738 pi->version = cur->pre_dirty();
4739
4740 // log + wait
4741 mdr->ls = mdlog->get_current_segment();
4742 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
4743 mdlog->start_entry(le);
4744 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4745 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4746 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4747
4748 mdr->no_early_reply = true;
4749 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4750 return;
4751 } else if (name == "ceph.dir.layout.pool_namespace"
4752 || name == "ceph.file.layout.pool_namespace") {
4753 // Namespace is the only layout field that has a meaningful
4754 // null/none value (an empty string means the default layout). It is
4755 // equivalent to a setxattr with an empty string: pass through the
4756 // empty payload of the rmxattr request to do this.
4757 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4758 return;
4759 }
4760
4761 respond_to_request(mdr, -ENODATA);
4762 }
4763
4764 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4765 CInode *in;
4766 public:
4767
4768 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4769 ServerLogContext(s, r), in(i) { }
4770 void finish(int r) override {
4771 assert(r == 0);
4772
4773 // apply
4774 in->pop_and_dirty_projected_inode(mdr->ls);
4775
4776 mdr->apply();
4777
4778 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
4779
4780 server->respond_to_request(mdr, 0);
4781 }
4782 };
4783
4784 void Server::handle_client_setxattr(MDRequestRef& mdr)
4785 {
4786 MClientRequest *req = mdr->client_request;
4787 string name(req->get_path2());
4788 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4789 CInode *cur;
4790
4791 file_layout_t *dir_layout = NULL;
4792 if (name.compare(0, 15, "ceph.dir.layout") == 0)
4793 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4794 else
4795 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4796 if (!cur)
4797 return;
4798
4799 if (mdr->snapid != CEPH_NOSNAP) {
4800 respond_to_request(mdr, -EROFS);
4801 return;
4802 }
4803
4804 int flags = req->head.args.setxattr.flags;
4805
4806 // magic ceph.* namespace?
4807 if (name.compare(0, 5, "ceph.") == 0) {
4808 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4809 return;
4810 }
4811
4812 xlocks.insert(&cur->xattrlock);
4813 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4814 return;
4815
4816 if (!check_access(mdr, cur, MAY_WRITE))
4817 return;
4818
4819 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4820 size_t len = req->get_data().length();
4821 size_t inc = len + name.length();
4822
4823 // check xattrs kv pairs size
4824 size_t cur_xattrs_size = 0;
4825 for (const auto& p : *pxattrs) {
4826 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
4827 continue;
4828 }
4829 cur_xattrs_size += p.first.length() + p.second.length();
4830 }
4831
4832 if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
4833 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4834 << cur_xattrs_size << ", inc " << inc << dendl;
4835 respond_to_request(mdr, -ENOSPC);
4836 return;
4837 }
4838
4839 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
4840 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
4841 respond_to_request(mdr, -EEXIST);
4842 return;
4843 }
4844 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
4845 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
4846 respond_to_request(mdr, -ENODATA);
4847 return;
4848 }
4849
4850 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
4851
4852 // project update
4853 map<string,bufferptr> *px = new map<string,bufferptr>;
4854 inode_t *pi = cur->project_inode(px);
4855 pi->version = cur->pre_dirty();
4856 pi->ctime = mdr->get_op_stamp();
4857 pi->change_attr++;
4858 pi->xattr_version++;
4859 px->erase(name);
4860 if (!(flags & CEPH_XATTR_REMOVE)) {
4861 (*px)[name] = buffer::create(len);
4862 if (len)
4863 req->get_data().copy(0, len, (*px)[name].c_str());
4864 }
4865
4866 // log + wait
4867 mdr->ls = mdlog->get_current_segment();
4868 EUpdate *le = new EUpdate(mdlog, "setxattr");
4869 mdlog->start_entry(le);
4870 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4871 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4872 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4873
4874 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4875 }
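
// The CEPH_XATTR_CREATE/CEPH_XATTR_REPLACE flags above mirror the standard
// setxattr(2) XATTR_CREATE/XATTR_REPLACE semantics. A sketch (hypothetical
// path and attribute name):
//
//   setxattr("/mnt/cephfs/file", "user.comment", "hi", 2, XATTR_CREATE);
//   // fails with EEXIST if user.comment is already set -- the -EEXIST
//   // branch above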
4876
4877 void Server::handle_client_removexattr(MDRequestRef& mdr)
4878 {
4879 MClientRequest *req = mdr->client_request;
4880 string name(req->get_path2());
4881 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4882 file_layout_t *dir_layout = NULL;
4883 CInode *cur;
4884 if (name == "ceph.dir.layout")
4885 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4886 else
4887 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4888 if (!cur)
4889 return;
4890
4891 if (mdr->snapid != CEPH_NOSNAP) {
4892 respond_to_request(mdr, -EROFS);
4893 return;
4894 }
4895
4896 if (name.compare(0, 5, "ceph.") == 0) {
4897 handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4898 return;
4899 }
4900
4901 xlocks.insert(&cur->xattrlock);
4902 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4903 return;
4904
4905 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4906 if (pxattrs->count(name) == 0) {
4907 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
4908 respond_to_request(mdr, -ENODATA);
4909 return;
4910 }
4911
4912 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
4913
4914 // project update
4915 map<string,bufferptr> *px = new map<string,bufferptr>;
4916 inode_t *pi = cur->project_inode(px);
4917 pi->version = cur->pre_dirty();
4918 pi->ctime = mdr->get_op_stamp();
4919 pi->change_attr++;
4920 pi->xattr_version++;
4921 px->erase(name);
4922
4923 // log + wait
4924 mdr->ls = mdlog->get_current_segment();
4925 EUpdate *le = new EUpdate(mdlog, "removexattr");
4926 mdlog->start_entry(le);
4927 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4928 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4929 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4930
4931 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4932 }
4933
4934
4935 // =================================================================
4936 // DIRECTORY and NAMESPACE OPS
4937
4938
4939 // ------------------------------------------------
4940
4941 // MKNOD
4942
4943 class C_MDS_mknod_finish : public ServerLogContext {
4944 CDentry *dn;
4945 CInode *newi;
4946 public:
4947 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4948 ServerLogContext(s, r), dn(d), newi(ni) {}
4949 void finish(int r) override {
4950 assert(r == 0);
4951
4952 // link the inode
4953 dn->pop_projected_linkage();
4954
4955 // be a bit hacky with the inode version, here.. we decrement it
4956 // just to keep mark_dirty() happy. (we didn't bother projecting
4957 // a new version of the inode since it's just been created)
4958 newi->inode.version--;
4959 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
4960 newi->_mark_dirty_parent(mdr->ls, true);
4961
4962 // mkdir?
4963 if (newi->inode.is_dir()) {
4964 CDir *dir = newi->get_dirfrag(frag_t());
4965 assert(dir);
4966 dir->fnode.version--;
4967 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
4968 dir->mark_new(mdr->ls);
4969 }
4970
4971 mdr->apply();
4972
4973 MDRequestRef null_ref;
4974 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4975
4976 if (newi->inode.is_file())
4977 get_mds()->locker->share_inode_max_size(newi);
4978
4979 // hit pop
4980 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
4981
4982 // reply
4983 server->respond_to_request(mdr, 0);
4984 }
4985 };
4986
4987
4988 void Server::handle_client_mknod(MDRequestRef& mdr)
4989 {
4990 MClientRequest *req = mdr->client_request;
4991 client_t client = mdr->get_client();
4992 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4993 file_layout_t *dir_layout = NULL;
4994 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
4995 &dir_layout);
4996 if (!dn) return;
4997 if (mdr->snapid != CEPH_NOSNAP) {
4998 respond_to_request(mdr, -EROFS);
4999 return;
5000 }
5001 CInode *diri = dn->get_dir()->get_inode();
5002 rdlocks.insert(&diri->authlock);
5003 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5004 return;
5005
5006 if (!check_access(mdr, diri, MAY_WRITE))
5007 return;
5008
5009 if (!check_fragment_space(mdr, dn->get_dir()))
5010 return;
5011
5012 unsigned mode = req->head.args.mknod.mode;
5013 if ((mode & S_IFMT) == 0)
5014 mode |= S_IFREG;
5015
5016 // set layout
5017 file_layout_t layout;
5018 if (dir_layout && S_ISREG(mode))
5019 layout = *dir_layout;
5020 else
5021 layout = mdcache->default_file_layout;
5022
5023 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5024 snapid_t follows = realm->get_newest_seq();
5025 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
5026 mode, &layout);
5027 assert(newi);
5028
5029 dn->push_projected_linkage(newi);
5030
5031 newi->inode.rdev = req->head.args.mknod.rdev;
5032 newi->inode.version = dn->pre_dirty();
5033 newi->inode.rstat.rfiles = 1;
5034 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5035 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5036 newi->inode.update_backtrace();
5037
5038 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5039 // want to write to it (e.g., if they are reexporting NFS)
5040 if (S_ISREG(newi->inode.mode)) {
5041 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5042 newi->inode.client_ranges[client].range.first = 0;
5043 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5044 newi->inode.client_ranges[client].follows = follows;
5045
5046 // issue a cap on the file
5047 int cmode = CEPH_FILE_MODE_RDWR;
5048 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5049 if (cap) {
5050 cap->set_wanted(0);
5051
5052 // put locks in excl mode
5053 newi->filelock.set_state(LOCK_EXCL);
5054 newi->authlock.set_state(LOCK_EXCL);
5055 newi->xattrlock.set_state(LOCK_EXCL);
5056 }
5057 }
5058
5059 assert(dn->first == follows + 1);
5060 newi->first = dn->first;
5061
5062 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5063
5064 // prepare finisher
5065 mdr->ls = mdlog->get_current_segment();
5066 EUpdate *le = new EUpdate(mdlog, "mknod");
5067 mdlog->start_entry(le);
5068 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5069 journal_allocated_inos(mdr, &le->metablob);
5070
5071 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5072 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5073 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5074
5075 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5076 }
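
// Note the regular-file special case above: mknod of a plain file
// pre-issues RDWR caps and a client_range, anticipating an immediate write
// (the NFS-reexport pattern). Triggering calls, sketched with plain POSIX
// (hypothetical paths):
//
//   mknod("/mnt/cephfs/fifo", S_IFIFO | 0644, 0);  // special file
//   mknod("/mnt/cephfs/file", S_IFREG | 0644, 0);  // regular: caps issued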
5077
5078
5079
5080 // MKDIR
5081 /* This function takes responsibility for the passed mdr*/
5082 void Server::handle_client_mkdir(MDRequestRef& mdr)
5083 {
5084 MClientRequest *req = mdr->client_request;
5085 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5086 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5087 if (!dn) return;
5088 if (mdr->snapid != CEPH_NOSNAP) {
5089 respond_to_request(mdr, -EROFS);
5090 return;
5091 }
5092 CDir *dir = dn->get_dir();
5093 CInode *diri = dir->get_inode();
5094 rdlocks.insert(&diri->authlock);
5095 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5096 return;
5097
5098 // mkdir check access
5099 if (!check_access(mdr, diri, MAY_WRITE))
5100 return;
5101
5102 if (!check_fragment_space(mdr, dir))
5103 return;
5104
5105 // new inode
5106 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5107 snapid_t follows = realm->get_newest_seq();
5108
5109 unsigned mode = req->head.args.mkdir.mode;
5110 mode &= ~S_IFMT;
5111 mode |= S_IFDIR;
5112 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5113 assert(newi);
5114
5115 // it's a directory.
5116 dn->push_projected_linkage(newi);
5117
5118 newi->inode.version = dn->pre_dirty();
5119 newi->inode.rstat.rsubdirs = 1;
5120 newi->inode.update_backtrace();
5121
5122 dout(12) << " follows " << follows << dendl;
5123 assert(dn->first == follows + 1);
5124 newi->first = dn->first;
5125
5126 // ...and that new dir is empty.
5127 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5128 newdir->state_set(CDir::STATE_CREATING);
5129 newdir->mark_complete();
5130 newdir->fnode.version = newdir->pre_dirty();
5131
5132 // prepare finisher
5133 mdr->ls = mdlog->get_current_segment();
5134 EUpdate *le = new EUpdate(mdlog, "mkdir");
5135 mdlog->start_entry(le);
5136 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5137 journal_allocated_inos(mdr, &le->metablob);
5138 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5139 le->metablob.add_primary_dentry(dn, newi, true, true);
5140 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5141
5142 // issue a cap on the directory
5143 int cmode = CEPH_FILE_MODE_RDWR;
5144 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5145 if (cap) {
5146 cap->set_wanted(0);
5147
5148 // put locks in excl mode
5149 newi->filelock.set_state(LOCK_EXCL);
5150 newi->authlock.set_state(LOCK_EXCL);
5151 newi->xattrlock.set_state(LOCK_EXCL);
5152 }
5153
5154 // make sure this inode gets into the journal
5155 le->metablob.add_opened_ino(newi->ino());
5156 LogSegment *ls = mds->mdlog->get_current_segment();
5157 ls->open_files.push_back(&newi->item_open_file);
5158
5159 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5160 }
5161
5162
5163 // SYMLINK
5164
5165 void Server::handle_client_symlink(MDRequestRef& mdr)
5166 {
5167 MClientRequest *req = mdr->client_request;
5168 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5169 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5170 if (!dn) return;
5171 if (mdr->snapid != CEPH_NOSNAP) {
5172 respond_to_request(mdr, -EROFS);
5173 return;
5174 }
5175 CDir *dir = dn->get_dir();
5176 CInode *diri = dir->get_inode();
5177 rdlocks.insert(&diri->authlock);
5178 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5179 return;
5180
5181 if (!check_access(mdr, diri, MAY_WRITE))
5182 return;
5183
5184 if (!check_fragment_space(mdr, dir))
5185 return;
5186
5187 unsigned mode = S_IFLNK | 0777;
5188 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5189 assert(newi);
5190
5191 // it's a symlink
5192 dn->push_projected_linkage(newi);
5193
5194 newi->symlink = req->get_path2();
5195 newi->inode.size = newi->symlink.length();
5196 newi->inode.rstat.rbytes = newi->inode.size;
5197 newi->inode.rstat.rfiles = 1;
5198 newi->inode.version = dn->pre_dirty();
5199 newi->inode.update_backtrace();
5200
5201 newi->first = dn->first;
5202
5203 // prepare finisher
5204 mdr->ls = mdlog->get_current_segment();
5205 EUpdate *le = new EUpdate(mdlog, "symlink");
5206 mdlog->start_entry(le);
5207 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5208 journal_allocated_inos(mdr, &le->metablob);
5209 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5210 le->metablob.add_primary_dentry(dn, newi, true, true);
5211
5212 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5213 }
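// Editor's example (target value hypothetical): for a request equivalent to
// `ln -s /usr/bin lnk`, req->get_path2() is "/usr/bin", so the new inode
// gets size = 8 and rstat.rbytes = 8. A symlink's content is just its
// target string, kept in CInode::symlink and journaled along with the
// dentry; no objects are written to the data pool.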
5214
5215
5216
5217
5218
5219 // LINK
5220
5221 void Server::handle_client_link(MDRequestRef& mdr)
5222 {
5223 MClientRequest *req = mdr->client_request;
5224
5225 dout(7) << "handle_client_link " << req->get_filepath()
5226 << " to " << req->get_filepath2()
5227 << dendl;
5228
5229 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5230
5231 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5232 if (!dn) return;
5233 CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
5234 if (!targeti) return;
5235 if (mdr->snapid != CEPH_NOSNAP) {
5236 respond_to_request(mdr, -EROFS);
5237 return;
5238 }
5239
5240 CDir *dir = dn->get_dir();
5241 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5242 dout(7) << "target is " << *targeti << dendl;
5243 if (targeti->is_dir()) {
5244 // if srcdn is replica, need to make sure its linkage is correct
5245 vector<CDentry*>& trace = mdr->dn[1];
5246 if (trace.empty() ||
5247 trace.back()->is_auth() ||
5248 trace.back()->lock.can_read(mdr->get_client())) {
5249 dout(7) << "target is a dir, failing..." << dendl;
5250 respond_to_request(mdr, -EINVAL);
5251 return;
5252 }
5253 }
5254
5255 xlocks.insert(&targeti->linklock);
5256
5257 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5258 return;
5259
5260 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5261 if (!check_access(mdr, targeti, MAY_WRITE))
5262 return;
5263
5264 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5265 return;
5266
5267 if (!check_fragment_space(mdr, dir))
5268 return;
5269 }
5270
5271 // go!
5272 assert(g_conf->mds_kill_link_at != 1);
5273
5274 // local or remote?
5275 if (targeti->is_auth())
5276 _link_local(mdr, dn, targeti);
5277 else
5278 _link_remote(mdr, true, dn, targeti);
5279 }
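// Editor's note: the dispatch above hinges solely on where the target inode
// is auth. A condensed restatement (names as used in this file):
#if 0
if (targeti->is_auth())
  _link_local(mdr, dn, targeti);        // single MDS, one EUpdate
else
  _link_remote(mdr, true, dn, targeti); // two MDSs: the target's auth MDS
                                        // journals a slave prepare first
#endif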
5280
5281
5282 class C_MDS_link_local_finish : public ServerLogContext {
5283 CDentry *dn;
5284 CInode *targeti;
5285 version_t dnpv;
5286 version_t tipv;
5287 public:
5288 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5289 version_t dnpv_, version_t tipv_) :
5290 ServerLogContext(s, r), dn(d), targeti(ti),
5291 dnpv(dnpv_), tipv(tipv_) { }
5292 void finish(int r) override {
5293 assert(r == 0);
5294 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5295 }
5296 };
5297
5298
5299 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5300 {
5301 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
5302
5303 mdr->ls = mdlog->get_current_segment();
5304
5305 // predirty NEW dentry
5306 version_t dnpv = dn->pre_dirty();
5307 version_t tipv = targeti->pre_dirty();
5308
5309 // project inode update
5310 inode_t *pi = targeti->project_inode();
5311 pi->nlink++;
5312 pi->ctime = mdr->get_op_stamp();
5313 pi->change_attr++;
5314 pi->version = tipv;
5315
5316 // log + wait
5317 EUpdate *le = new EUpdate(mdlog, "link_local");
5318 mdlog->start_entry(le);
5319 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5320 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
5321 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
5322 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5323 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
5324
5325 // do this after predirty_*, to avoid funky extra dnl arg
5326 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5327
5328 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
5329 }
5330
5331 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
5332 version_t dnpv, version_t tipv)
5333 {
5334 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
5335
5336 // link and unlock the NEW dentry
5337 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5338 if (!dnl->get_inode())
5339 dn->link_remote(dnl, targeti);
5340 dn->mark_dirty(dnpv, mdr->ls);
5341
5342 // target inode
5343 targeti->pop_and_dirty_projected_inode(mdr->ls);
5344
5345 mdr->apply();
5346
5347 MDRequestRef null_ref;
5348 mdcache->send_dentry_link(dn, null_ref);
5349
5350 // bump target popularity
5351 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5352 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5353
5354 // reply
5355 respond_to_request(mdr, 0);
5356 }
5357
5358
5359 // link / unlink remote
5360
5361 class C_MDS_link_remote_finish : public ServerLogContext {
5362 bool inc;
5363 CDentry *dn;
5364 CInode *targeti;
5365 version_t dpv;
5366 public:
5367 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5368 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5369 dpv(d->get_projected_version()) {}
5370 void finish(int r) override {
5371 assert(r == 0);
5372 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5373 }
5374 };
5375
5376 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
5377 {
5378 dout(10) << "_link_remote "
5379 << (inc ? "link ":"unlink ")
5380 << *dn << " to " << *targeti << dendl;
5381
5382 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5383 mds_rank_t linkauth = targeti->authority().first;
5384 if (mdr->more()->witnessed.count(linkauth) == 0) {
5385 if (mds->is_cluster_degraded() &&
5386 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
5387 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
5388 if (mdr->more()->waiting_on_slave.empty())
5389 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
5390 return;
5391 }
5392
5393 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
5394 int op;
5395 if (inc)
5396 op = MMDSSlaveRequest::OP_LINKPREP;
5397 else
5398 op = MMDSSlaveRequest::OP_UNLINKPREP;
5399 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
5400 targeti->set_object_info(req->get_object_info());
5401 req->op_stamp = mdr->get_op_stamp();
5402 mds->send_message_mds(req, linkauth);
5403
5404 assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
5405 mdr->more()->waiting_on_slave.insert(linkauth);
5406 return;
5407 }
5408 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
5409
5410 assert(g_conf->mds_kill_link_at != 2);
5411
5412 mdr->set_mds_stamp(ceph_clock_now());
5413
5414 // add to event
5415 mdr->ls = mdlog->get_current_segment();
5416 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
5417 mdlog->start_entry(le);
5418 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5419 if (!mdr->more()->witnessed.empty()) {
5420 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5421 le->reqid = mdr->reqid;
5422 le->had_slaves = true;
5423 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5424 }
5425
5426 if (inc) {
5427 dn->pre_dirty();
5428 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
5429 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5430 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5431 } else {
5432 dn->pre_dirty();
5433 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
5434 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5435 le->metablob.add_null_dentry(dn, true);
5436 dn->push_projected_linkage();
5437 }
5438
5439 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
5440 }
5441
5442 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
5443 CDentry *dn, CInode *targeti,
5444 version_t dpv)
5445 {
5446 dout(10) << "_link_remote_finish "
5447 << (inc ? "link ":"unlink ")
5448 << *dn << " to " << *targeti << dendl;
5449
5450 assert(g_conf->mds_kill_link_at != 3);
5451
5452 if (!mdr->more()->witnessed.empty())
5453 mdcache->logged_master_update(mdr->reqid);
5454
5455 if (inc) {
5456 // link the new dentry
5457 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5458 if (!dnl->get_inode())
5459 dn->link_remote(dnl, targeti);
5460 dn->mark_dirty(dpv, mdr->ls);
5461 } else {
5462 // unlink main dentry
5463 dn->get_dir()->unlink_inode(dn);
5464 dn->pop_projected_linkage();
5465 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
5466 }
5467
5468 mdr->apply();
5469
5470 MDRequestRef null_ref;
5471 if (inc)
5472 mdcache->send_dentry_link(dn, null_ref);
5473 else
5474 mdcache->send_dentry_unlink(dn, NULL, null_ref);
5475
5476 // bump target popularity
5477 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5478 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5479
5480 // reply
5481 respond_to_request(mdr, 0);
5482
5483 if (!inc)
5484 // removing a new dn?
5485 dn->get_dir()->try_remove_unlinked_dn(dn);
5486 }
5487
5488
5489 // remote linking/unlinking
5490
5491 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5492 CInode *targeti;
5493 public:
5494 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5495 ServerLogContext(s, r), targeti(t) { }
5496 void finish(int r) override {
5497 assert(r == 0);
5498 server->_logged_slave_link(mdr, targeti);
5499 }
5500 };
5501
5502 class C_MDS_SlaveLinkCommit : public ServerContext {
5503 MDRequestRef mdr;
5504 CInode *targeti;
5505 public:
5506 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5507 ServerContext(s), mdr(r), targeti(t) { }
5508 void finish(int r) override {
5509 server->_commit_slave_link(mdr, r, targeti);
5510 }
5511 };
5512
5513 /* This function DOES put the mdr->slave_request before returning*/
5514 void Server::handle_slave_link_prep(MDRequestRef& mdr)
5515 {
5516 dout(10) << "handle_slave_link_prep " << *mdr
5517 << " on " << mdr->slave_request->get_object_info()
5518 << dendl;
5519
5520 assert(g_conf->mds_kill_link_at != 4);
5521
5522 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
5523 assert(targeti);
5524 dout(10) << "targeti " << *targeti << dendl;
5525 CDentry *dn = targeti->get_parent_dn();
5526 CDentry::linkage_t *dnl = dn->get_linkage();
5527 assert(dnl->is_primary());
5528
5529 mdr->set_op_stamp(mdr->slave_request->op_stamp);
5530
5531 mdr->auth_pin(targeti);
5532
5533 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5534 assert(g_conf->mds_kill_link_at != 5);
5535
5536 // journal it
5537 mdr->ls = mdlog->get_current_segment();
5538 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
5539 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
5540 mdlog->start_entry(le);
5541
5542 inode_t *pi = dnl->get_inode()->project_inode();
5543
5544 // update journaled target inode
5545 bool inc;
5546 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
5547 inc = true;
5548 pi->nlink++;
5549 } else {
5550 inc = false;
5551 pi->nlink--;
5552 }
5553
5554 link_rollback rollback;
5555 rollback.reqid = mdr->reqid;
5556 rollback.ino = targeti->ino();
5557 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
5558 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
5559 rollback.old_dir_mtime = pf->fragstat.mtime;
5560 rollback.old_dir_rctime = pf->rstat.rctime;
5561 rollback.was_inc = inc;
5562 ::encode(rollback, le->rollback);
5563 mdr->more()->rollback_bl = le->rollback;
5564
5565 pi->ctime = mdr->get_op_stamp();
5566 pi->version = targeti->pre_dirty();
5567
5568 dout(10) << " projected inode " << pi << " v " << pi->version << dendl;
5569
5570 // commit case
5571 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
5572 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
5573
5574 // set up commit waiter
5575 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
5576
5577 mdr->more()->slave_update_journaled = true;
5578 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
5579 mdr, __func__);
5580 mdlog->flush();
5581 }
5582
5583 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
5584 {
5585 dout(10) << "_logged_slave_link " << *mdr
5586 << " " << *targeti << dendl;
5587
5588 assert(g_conf->mds_kill_link_at != 6);
5589
5590 // update the target
5591 targeti->pop_and_dirty_projected_inode(mdr->ls);
5592 mdr->apply();
5593
5594 // hit pop
5595 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5596
5597 // done.
5598 mdr->slave_request->put();
5599 mdr->slave_request = 0;
5600
5601 // ack
5602 if (!mdr->aborted) {
5603 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5604 MMDSSlaveRequest::OP_LINKPREPACK);
5605 mds->send_message_mds(reply, mdr->slave_to_mds);
5606 } else {
5607 dout(10) << " abort flag set, finishing" << dendl;
5608 mdcache->request_finish(mdr);
5609 }
5610 }
5611
5612
5613 struct C_MDS_CommittedSlave : public ServerLogContext {
5614 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
5615 void finish(int r) override {
5616 server->_committed_slave(mdr);
5617 }
5618 };
5619
5620 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
5621 {
5622 dout(10) << "_commit_slave_link " << *mdr
5623 << " r=" << r
5624 << " " << *targeti << dendl;
5625
5626 assert(g_conf->mds_kill_link_at != 7);
5627
5628 if (r == 0) {
5629 // drop our pins, etc.
5630 mdr->cleanup();
5631
5632 // write a commit to the journal
5633 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
5634 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
5635 mdlog->start_entry(le);
5636 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
5637 mdlog->flush();
5638 } else {
5639 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
5640 }
5641 }
5642
5643 void Server::_committed_slave(MDRequestRef& mdr)
5644 {
5645 dout(10) << "_committed_slave " << *mdr << dendl;
5646
5647 assert(g_conf->mds_kill_link_at != 8);
5648
5649 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5650 MMDSSlaveRequest::OP_COMMITTED);
5651 mds->send_message_mds(req, mdr->slave_to_mds);
5652 mdcache->request_finish(mdr);
5653 }
5654
5655 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5656 MutationRef mut;
5657 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5658 void finish(int r) override {
5659 server->_link_rollback_finish(mut, mdr);
5660 }
5661 };
5662
5663 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
5664 {
5665 link_rollback rollback;
5666 bufferlist::iterator p = rbl.begin();
5667 ::decode(rollback, p);
5668
5669 dout(10) << "do_link_rollback on " << rollback.reqid
5670 << (rollback.was_inc ? " inc":" dec")
5671 << " ino " << rollback.ino
5672 << dendl;
5673
5674 assert(g_conf->mds_kill_link_at != 9);
5675
5676 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
5677 assert(mdr || mds->is_resolve());
5678
5679 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
5680 mut->ls = mds->mdlog->get_current_segment();
5681
5682 CInode *in = mdcache->get_inode(rollback.ino);
5683 assert(in);
5684 dout(10) << " target is " << *in << dendl;
5685 assert(!in->is_projected()); // a live slave request holds the versionlock xlock.
5686
5687 inode_t *pi = in->project_inode();
5688 pi->version = in->pre_dirty();
5689 mut->add_projected_inode(in);
5690
5691 // parent dir rctime
5692 CDir *parent = in->get_projected_parent_dn()->get_dir();
5693 fnode_t *pf = parent->project_fnode();
5694 mut->add_projected_fnode(parent);
5695 pf->version = parent->pre_dirty();
5696 if (pf->fragstat.mtime == pi->ctime) {
5697 pf->fragstat.mtime = rollback.old_dir_mtime;
5698 if (pf->rstat.rctime == pi->ctime)
5699 pf->rstat.rctime = rollback.old_dir_rctime;
5700 mut->add_updated_lock(&parent->get_inode()->filelock);
5701 mut->add_updated_lock(&parent->get_inode()->nestlock);
5702 }
5703
5704 // inode
5705 pi->ctime = rollback.old_ctime;
5706 if (rollback.was_inc)
5707 pi->nlink--;
5708 else
5709 pi->nlink++;
5710
5711 // journal it
5712 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
5713 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
5714 mdlog->start_entry(le);
5715 le->commit.add_dir_context(parent);
5716 le->commit.add_dir(parent, true);
5717 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
5718
5719 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
5720 mdr, __func__);
5721 mdlog->flush();
5722 }
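// Editor's example (ranks and ino hypothetical): master mds.0 asked mds.1
// to prepare nlink++ on ino 0x10000000001, then failed before committing.
// During resolve, mds.1 runs this rollback: nlink is decremented again,
// ctime reverts to rollback.old_ctime, and the parent dir's mtime/rctime
// revert to the values saved in link_rollback. The core inversion:
#if 0
if (rollback.was_inc)
  pi->nlink--;   // a link was prepared: undo the ++
else
  pi->nlink++;   // an unlink was prepared: undo the --
#endif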
5723
5724 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
5725 {
5726 dout(10) << "_link_rollback_finish" << dendl;
5727
5728 assert(g_conf->mds_kill_link_at != 10);
5729
5730 mut->apply();
5731 if (mdr)
5732 mdcache->request_finish(mdr);
5733
5734 mdcache->finish_rollback(mut->reqid);
5735
5736 mut->cleanup();
5737 }
5738
5739
5740 /* This function DOES NOT put the passed message before returning*/
5741 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
5742 {
5743 dout(10) << "handle_slave_link_prep_ack " << *mdr
5744 << " " << *m << dendl;
5745 mds_rank_t from = mds_rank_t(m->get_source().num());
5746
5747 assert(g_conf->mds_kill_link_at != 11);
5748
5749 // note slave
5750 mdr->more()->slaves.insert(from);
5751
5752 // witnessed!
5753 assert(mdr->more()->witnessed.count(from) == 0);
5754 mdr->more()->witnessed.insert(from);
5755 assert(!m->is_not_journaled());
5756 mdr->more()->has_journaled_slaves = true;
5757
5758 // remove from waiting list
5759 assert(mdr->more()->waiting_on_slave.count(from));
5760 mdr->more()->waiting_on_slave.erase(from);
5761
5762 assert(mdr->more()->waiting_on_slave.empty());
5763
5764 dispatch_client_request(mdr); // go again!
5765 }
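// Editor's sketch of the remote-link handshake, assembled from the handlers
// above; treat it as an approximation of the message flow:
#if 0
// master                                  slave (targeti's auth)
// _link_remote: OP_LINKPREP       --->    handle_slave_link_prep:
//   waiting_on_slave.insert(auth)           journal ESlaveUpdate PREPARE
//                                           (projected nlink++/--)
// handle_slave_link_prep_ack      <---    _logged_slave_link: OP_LINKPREPACK
//   witnessed.insert(from);
//   dispatch_client_request()  // retries, now journals the master EUpdate
// ... after the master commits, _commit_slave_link journals a COMMIT on the
// slave and _committed_slave sends OP_COMMITTED back.
#endif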
5766
5767
5768
5769
5770
5771 // UNLINK
5772
5773 void Server::handle_client_unlink(MDRequestRef& mdr)
5774 {
5775 MClientRequest *req = mdr->client_request;
5776 client_t client = mdr->get_client();
5777
5778 // rmdir or unlink?
5779 bool rmdir = false;
5780 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
5781
5782 if (req->get_filepath().depth() == 0) {
5783 respond_to_request(mdr, -EINVAL);
5784 return;
5785 }
5786
5787 // traverse to path
5788 vector<CDentry*> trace;
5789 CInode *in;
5790 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
5791 if (r > 0) return;
5792 if (r < 0) {
5793 if (r == -ESTALE) {
5794 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
5795 mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
5796 return;
5797 }
5798 respond_to_request(mdr, r);
5799 return;
5800 }
5801 if (mdr->snapid != CEPH_NOSNAP) {
5802 respond_to_request(mdr, -EROFS);
5803 return;
5804 }
5805
5806 CDentry *dn = trace[trace.size()-1];
5807 assert(dn);
5808 if (!dn->is_auth()) {
5809 mdcache->request_forward(mdr, dn->authority().first);
5810 return;
5811 }
5812
5813 CInode *diri = dn->get_dir()->get_inode();
5814
5815 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
5816 assert(!dnl->is_null());
5817
5818 if (rmdir) {
5819 dout(7) << "handle_client_rmdir on " << *dn << dendl;
5820 } else {
5821 dout(7) << "handle_client_unlink on " << *dn << dendl;
5822 }
5823 dout(7) << "dn links to " << *in << dendl;
5824
5825 // rmdir vs is_dir
5826 if (in->is_dir()) {
5827 if (rmdir) {
5828 // do empty directory checks
5829 if (_dir_is_nonempty_unlocked(mdr, in)) {
5830 respond_to_request(mdr, -ENOTEMPTY);
5831 return;
5832 }
5833 } else {
5834 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
5835 respond_to_request(mdr, -EISDIR);
5836 return;
5837 }
5838 } else {
5839 if (rmdir) {
5840 // unlink
5841 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
5842 respond_to_request(mdr, -ENOTDIR);
5843 return;
5844 }
5845 }
5846
5847 // -- create stray dentry? --
5848 CDentry *straydn = NULL;
5849 if (dnl->is_primary()) {
5850 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
5851 if (!straydn)
5852 return;
5853 dout(10) << " straydn is " << *straydn << dendl;
5854 } else if (mdr->straydn) {
5855 mdr->unpin(mdr->straydn);
5856 mdr->straydn = NULL;
5857 }
5858
5859 // lock
5860 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5861
5862 for (int i=0; i<(int)trace.size()-1; i++)
5863 rdlocks.insert(&trace[i]->lock);
5864 xlocks.insert(&dn->lock);
5865 wrlocks.insert(&diri->filelock);
5866 wrlocks.insert(&diri->nestlock);
5867 xlocks.insert(&in->linklock);
5868 if (straydn) {
5869 wrlocks.insert(&straydn->get_dir()->inode->filelock);
5870 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
5871 xlocks.insert(&straydn->lock);
5872 }
5873 if (in->is_dir())
5874 rdlocks.insert(&in->filelock); // to verify it's empty
5875 mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
5876
5877 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5878 return;
5879
5880 if (in->is_dir() &&
5881 _dir_is_nonempty(mdr, in)) {
5882 respond_to_request(mdr, -ENOTEMPTY);
5883 return;
5884 }
5885
5886 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5887 if (!check_access(mdr, diri, MAY_WRITE))
5888 return;
5889 }
5890
5891 // yay!
5892 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
5893 // subtree root auths need to be witnesses
5894 set<mds_rank_t> witnesses;
5895 in->list_replicas(witnesses);
5896 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
5897
5898 for (set<mds_rank_t>::iterator p = witnesses.begin();
5899 p != witnesses.end();
5900 ++p) {
5901 if (mdr->more()->witnessed.count(*p)) {
5902 dout(10) << " already witnessed by mds." << *p << dendl;
5903 } else if (mdr->more()->waiting_on_slave.count(*p)) {
5904 dout(10) << " already waiting on witness mds." << *p << dendl;
5905 } else {
5906 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
5907 return;
5908 }
5909 }
5910 if (!mdr->more()->waiting_on_slave.empty())
5911 return; // we're waiting for a witness.
5912 }
5913
5914 // ok!
5915 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
5916 _link_remote(mdr, false, dn, dnl->get_inode());
5917 else
5918 _unlink_local(mdr, dn, straydn);
5919 }
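// Editor's summary of the op/type checks above, as a small decision table:
//
//   target \ op      CEPH_MDS_OP_UNLINK   CEPH_MDS_OP_RMDIR
//   directory        -EISDIR              empty? proceed : -ENOTEMPTY
//   non-directory    proceed              -ENOTDIR
//
// In addition, a primary linkage gets a stray dentry prepared, since the
// inode must be relinked under a stray directory until its last reference
// (caps, other links) goes away.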
5920
5921 class C_MDS_unlink_local_finish : public ServerLogContext {
5922 CDentry *dn;
5923 CDentry *straydn;
5924 version_t dnpv; // deleted dentry
5925 public:
5926 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
5927 ServerLogContext(s, r), dn(d), straydn(sd),
5928 dnpv(d->get_projected_version()) {}
5929 void finish(int r) override {
5930 assert(r == 0);
5931 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
5932 }
5933 };
5934
5935 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
5936 {
5937 dout(10) << "_unlink_local " << *dn << dendl;
5938
5939 CDentry::linkage_t *dnl = dn->get_projected_linkage();
5940 CInode *in = dnl->get_inode();
5941
5942 SnapRealm *realm = in->find_snaprealm();
5943 snapid_t follows = realm->get_newest_seq();
5944
5945 // ok, let's do it.
5946 mdr->ls = mdlog->get_current_segment();
5947
5948 // prepare log entry
5949 EUpdate *le = new EUpdate(mdlog, "unlink_local");
5950 mdlog->start_entry(le);
5951 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5952 if (!mdr->more()->witnessed.empty()) {
5953 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5954 le->reqid = mdr->reqid;
5955 le->had_slaves = true;
5956 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5957 }
5958
5959 if (straydn) {
5960 assert(dnl->is_primary());
5961 straydn->push_projected_linkage(in);
5962 straydn->first = follows + 1;
5963 }
5964
5965 // the unlinked dentry
5966 dn->pre_dirty();
5967
5968 inode_t *pi = in->project_inode();
5969 dn->make_path_string(pi->stray_prior_path, true);
5970 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
5971 pi->version = in->pre_dirty();
5972 pi->ctime = mdr->get_op_stamp();
5973 pi->change_attr++;
5974 pi->nlink--;
5975 if (pi->nlink == 0)
5976 in->state_set(CInode::STATE_ORPHAN);
5977
5978 if (dnl->is_primary()) {
5979 // primary link. add stray dentry.
5980 assert(straydn);
5981 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
5982 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5983
5984 // project snaprealm, too
5985 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
5986 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
5987
5988 pi->update_backtrace();
5989 le->metablob.add_primary_dentry(straydn, in, true, true);
5990 } else {
5991 // remote link. update remote inode.
5992 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
5993 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5994 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5995 }
5996
5997 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5998 le->metablob.add_null_dentry(dn, true);
5999
6000 if (in->is_dir()) {
6001 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6002 le->metablob.renamed_dirino = in->ino();
6003 }
6004
6005 dn->push_projected_linkage();
6006
6007 if (in->is_dir()) {
6008 assert(straydn);
6009 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6010
6011 in->maybe_export_pin(true);
6012 }
6013
6014 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6015 }
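// Editor's example (paths and ino hypothetical): unlinking the sole primary
// link /a/b does not delete the inode; it is relinked under a stray dentry
// such as ~mds0/stray3/10000000abc, because clients may still hold caps or
// open handles on it. The snippet above that records the old location:
#if 0
inode_t *pi = in->project_inode();
dn->make_path_string(pi->stray_prior_path, true); // remember "/a/b"
#endif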
6016
6017 void Server::_unlink_local_finish(MDRequestRef& mdr,
6018 CDentry *dn, CDentry *straydn,
6019 version_t dnpv)
6020 {
6021 dout(10) << "_unlink_local_finish " << *dn << dendl;
6022
6023 if (!mdr->more()->witnessed.empty())
6024 mdcache->logged_master_update(mdr->reqid);
6025
6026 // unlink main dentry
6027 dn->get_dir()->unlink_inode(dn);
6028 dn->pop_projected_linkage();
6029
6030 // relink as stray? (i.e. was primary link?)
6031 CInode *strayin = NULL;
6032 bool snap_is_new = false;
6033 if (straydn) {
6034 dout(20) << " straydn is " << *straydn << dendl;
6035 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
6036 strayin = straydnl->get_inode();
6037
6038 snap_is_new = strayin->snaprealm ? true : false;
6039 mdcache->touch_dentry_bottom(straydn);
6040 }
6041
6042 dn->mark_dirty(dnpv, mdr->ls);
6043 mdr->apply();
6044
6045 if (snap_is_new) //only new if strayin exists
6046 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
6047
6048 mdcache->send_dentry_unlink(dn, straydn, mdr);
6049
6050 // update subtree map?
6051 if (straydn && strayin->is_dir())
6052 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6053
6054 // bump pop
6055 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
6056
6057 // reply
6058 respond_to_request(mdr, 0);
6059
6060 // removing a new dn?
6061 dn->get_dir()->try_remove_unlinked_dn(dn);
6062
6063 // clean up ?
6064 // respond_to_request() drops locks. So stray reintegration can race with us.
6065 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6066 // Tip off the MDCache that this dentry is a stray that
6067 // might be eligible for purge.
6068 mdcache->notify_stray(straydn);
6069 }
6070 }
6071
6072 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6073 {
6074 if (mds->is_cluster_degraded() &&
6075 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6076 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6077 if (mdr->more()->waiting_on_slave.empty())
6078 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6079 return false;
6080 }
6081
6082 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6083 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6084 MMDSSlaveRequest::OP_RMDIRPREP);
6085 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6086 for (auto dn : trace)
6087 req->srcdnpath.push_dentry(dn->name);
6088 mdcache->replicate_stray(straydn, who, req->stray);
6089
6090 req->op_stamp = mdr->get_op_stamp();
6091 mds->send_message_mds(req, who);
6092
6093 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6094 mdr->more()->waiting_on_slave.insert(who);
6095 return true;
6096 }
6097
6098 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6099 CDentry *dn, *straydn;
6100 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6101 : ServerLogContext(s, r), dn(d), straydn(st) {}
6102 void finish(int r) override {
6103 server->_logged_slave_rmdir(mdr, dn, straydn);
6104 }
6105 };
6106
6107 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6108 MDRequestRef mdr;
6109 CDentry *straydn;
6110 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6111 : ServerContext(s), mdr(r), straydn(sd) { }
6112 void finish(int r) override {
6113 server->_commit_slave_rmdir(mdr, r, straydn);
6114 }
6115 };
6116
6117 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6118 {
6119 dout(10) << "handle_slave_rmdir_prep " << *mdr
6120 << " " << mdr->slave_request->srcdnpath
6121 << " to " << mdr->slave_request->destdnpath
6122 << dendl;
6123
6124 vector<CDentry*> trace;
6125 filepath srcpath(mdr->slave_request->srcdnpath);
6126 dout(10) << " src " << srcpath << dendl;
6127 CInode *in;
6128 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6129 if (r > 0) return;
6130 if (r == -ESTALE) {
6131 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6132 mdr->slave_to_mds);
6133 return;
6134 }
6135 assert(r == 0);
6136 CDentry *dn = trace[trace.size()-1];
6137 dout(10) << " dn " << *dn << dendl;
6138 mdr->pin(dn);
6139
6140 assert(mdr->straydn);
6141 CDentry *straydn = mdr->straydn;
6142 dout(10) << " straydn " << *straydn << dendl;
6143
6144 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6145
6146 rmdir_rollback rollback;
6147 rollback.reqid = mdr->reqid;
6148 rollback.src_dir = dn->get_dir()->dirfrag();
6149 rollback.src_dname = dn->name;
6150 rollback.dest_dir = straydn->get_dir()->dirfrag();
6151 rollback.dest_dname = straydn->name;
6152 ::encode(rollback, mdr->more()->rollback_bl);
6153 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6154
6155 // set up commit waiter
6156 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
6157
6158 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6159 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6160 dn->get_dir()->unlink_inode(dn);
6161 straydn->get_dir()->link_primary_inode(straydn, in);
6162
6163 assert(straydn->first >= in->first);
6164 in->first = straydn->first;
6165
6166 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
6167
6168 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6169 MMDSSlaveRequest::OP_RMDIRPREPACK);
6170 reply->mark_not_journaled();
6171 mds->send_message_mds(reply, mdr->slave_to_mds);
6172
6173 // send caps to auth (if we're not already)
6174 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
6175 mdcache->migrator->export_caps(in);
6176
6177 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
6178
6179 mdr->slave_request->put();
6180 mdr->slave_request = 0;
6181 mdr->straydn = 0;
6182 return;
6183 }
6184
6185 straydn->push_projected_linkage(in);
6186 dn->push_projected_linkage();
6187
6188 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6189 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
6190 mdlog->start_entry(le);
6191 le->rollback = mdr->more()->rollback_bl;
6192
6193 le->commit.add_dir_context(straydn->get_dir());
6194 le->commit.add_primary_dentry(straydn, in, true);
6195 // slave: no need to journal original dentry
6196
6197 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6198 le->commit.renamed_dirino = in->ino();
6199
6200 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6201
6202 mdr->more()->slave_update_journaled = true;
6203 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
6204 mdr, __func__);
6205 mdlog->flush();
6206 }
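// Editor's note: the early return above is the no-journal fast path. When
// this slave is not auth for any subtree under the dir being removed, it
// relinks the inode under the stray dentry in cache only and acks with
// mark_not_journaled(), telling the master there is no ESlaveUpdate to
// commit or roll back on this rank; the master-side consequence is the
// is_not_journaled() check in handle_slave_rmdir_prep_ack below.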
6207
6208 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6209 {
6210 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
6211
6212 // update our cache now, so we are consistent with what is in the journal
6213 // when we journal a subtree map
6214 CInode *in = dn->get_linkage()->get_inode();
6215 dn->get_dir()->unlink_inode(dn);
6216 straydn->pop_projected_linkage();
6217 dn->pop_projected_linkage();
6218 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
6219
6220 // done.
6221 mdr->slave_request->put();
6222 mdr->slave_request = 0;
6223 mdr->straydn = 0;
6224
6225 if (!mdr->aborted) {
6226 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6227 MMDSSlaveRequest::OP_RMDIRPREPACK);
6228 mds->send_message_mds(reply, mdr->slave_to_mds);
6229 } else {
6230 dout(10) << " abort flag set, finishing" << dendl;
6231 mdcache->request_finish(mdr);
6232 }
6233 }
6234
6235 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
6236 {
6237 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6238 << " " << *ack << dendl;
6239
6240 mds_rank_t from = mds_rank_t(ack->get_source().num());
6241
6242 mdr->more()->slaves.insert(from);
6243 mdr->more()->witnessed.insert(from);
6244 if (!ack->is_not_journaled())
6245 mdr->more()->has_journaled_slaves = true;
6246
6247 // remove from waiting list
6248 assert(mdr->more()->waiting_on_slave.count(from));
6249 mdr->more()->waiting_on_slave.erase(from);
6250
6251 if (mdr->more()->waiting_on_slave.empty())
6252 dispatch_client_request(mdr); // go again!
6253 else
6254 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
6255 }
6256
6257 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
6258 {
6259 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
6260
6261 if (r == 0) {
6262 if (mdr->more()->slave_update_journaled) {
6263 CInode *strayin = straydn->get_projected_linkage()->get_inode();
6264 if (strayin && !strayin->snaprealm)
6265 mdcache->clear_dirty_bits_for_stray(strayin);
6266 }
6267
6268 mdr->cleanup();
6269
6270 if (mdr->more()->slave_update_journaled) {
6271 // write a commit to the journal
6272 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
6273 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
6274 ESlaveUpdate::RMDIR);
6275 mdlog->start_entry(le);
6276 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6277 mdlog->flush();
6278 } else {
6279 _committed_slave(mdr);
6280 }
6281 } else {
6282 // abort
6283 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6284 }
6285 }
6286
6287 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6288 metareqid_t reqid;
6289 CDentry *dn;
6290 CDentry *straydn;
6291 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6292 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6293 void finish(int r) override {
6294 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6295 }
6296 };
6297
6298 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6299 {
6300 // unlike the other rollback methods, the rmdir rollback is only
6301 // needed to record the subtree changes in the journal for inode
6302 // replicas who are auth for empty dirfrags. no actual changes to
6303 // the file system are taking place here, so there is no Mutation.
6304
6305 rmdir_rollback rollback;
6306 bufferlist::iterator p = rbl.begin();
6307 ::decode(rollback, p);
6308
6309 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6310 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6311 assert(mdr || mds->is_resolve());
6312
6313 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6314 if (!dir)
6315 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6316 assert(dir);
6317 CDentry *dn = dir->lookup(rollback.src_dname);
6318 assert(dn);
6319 dout(10) << " dn " << *dn << dendl;
6320 dir = mdcache->get_dirfrag(rollback.dest_dir);
6321 assert(dir);
6322 CDentry *straydn = dir->lookup(rollback.dest_dname);
6323 assert(straydn);
6324 dout(10) << " straydn " << *dn << dendl;
6325 CInode *in = straydn->get_linkage()->get_inode();
6326
6327 if (mdr && !mdr->more()->slave_update_journaled) {
6328 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6329
6330 straydn->get_dir()->unlink_inode(straydn);
6331 dn->get_dir()->link_primary_inode(dn, in);
6332
6333 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6334
6335 mdcache->request_finish(mdr);
6336 mdcache->finish_rollback(rollback.reqid);
6337 return;
6338 }
6339
6340 dn->push_projected_linkage(in);
6341 straydn->push_projected_linkage();
6342
6343 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6344 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6345 mdlog->start_entry(le);
6346
6347 le->commit.add_dir_context(dn->get_dir());
6348 le->commit.add_primary_dentry(dn, in, true);
6349 // slave: no need to journal straydn
6350
6351 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6352 le->commit.renamed_dirino = in->ino();
6353
6354 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6355
6356 submit_mdlog_entry(le,
6357 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6358 dn, straydn),
6359 mdr, __func__);
6360 mdlog->flush();
6361 }
6362
6363 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
6364 {
6365 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
6366
6367 straydn->get_dir()->unlink_inode(straydn);
6368 dn->pop_projected_linkage();
6369 straydn->pop_projected_linkage();
6370
6371 CInode *in = dn->get_linkage()->get_inode();
6372 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
6373 if (mds->is_resolve()) {
6374 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
6375 mdcache->try_trim_non_auth_subtree(root);
6376 }
6377
6378 if (mdr)
6379 mdcache->request_finish(mdr);
6380
6381 mdcache->finish_rollback(reqid);
6382 }
6383
6384
6385 /** _dir_is_nonempty[_unlocked]
6386 *
6387 * check if a directory is non-empty (i.e. whether it can be rmdir'd).
6388 *
6389 * the unlocked variant is a fastpath check; we can't really be
6390 * sure until we rdlock the filelock.
6391 */
6392 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6393 {
6394 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6395 assert(in->is_auth());
6396
6397 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6398 return true; // in a snapshot!
6399
6400 list<CDir*> ls;
6401 in->get_dirfrags(ls);
6402 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6403 CDir *dir = *p;
6404 // is the frag obviously non-empty?
6405 if (dir->is_auth()) {
6406 if (dir->get_projected_fnode()->fragstat.size()) {
6407 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6408 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6409 return true;
6410 }
6411 }
6412 }
6413
6414 return false;
6415 }
6416
6417 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6418 {
6419 dout(10) << "dir_is_nonempty " << *in << dendl;
6420 assert(in->is_auth());
6421 assert(in->filelock.can_read(mdr->get_client()));
6422
6423 frag_info_t dirstat;
6424 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6425
6426 list<CDir*> ls;
6427 in->get_dirfrags(ls);
6428 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6429 CDir *dir = *p;
6430 const fnode_t *pf = dir->get_projected_fnode();
6431 if (pf->fragstat.size()) {
6432 dout(10) << "dir_is_nonempty dirstat has "
6433 << pf->fragstat.size() << " items " << *dir << dendl;
6434 return true;
6435 }
6436
6437 if (pf->accounted_fragstat.version == dirstat_version)
6438 dirstat.add(pf->accounted_fragstat);
6439 else
6440 dirstat.add(pf->fragstat);
6441 }
6442
6443 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6444 }
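// Editor's worked example (numbers hypothetical): say the inode's projected
// dirstat claims 2 entries at version 7, every frag's projected fragstat is
// empty, and the frags' accounted_fragstat totals reach only 0 entries (one
// frag's accounted version is stale). The loop sums 0 != 2, so we return
// non-empty: an update is still in flight and emptiness cannot be proven,
// which is the conservative answer for rmdir.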
6445
6446
6447 // ======================================================
6448
6449
6450 class C_MDS_rename_finish : public ServerLogContext {
6451 CDentry *srcdn;
6452 CDentry *destdn;
6453 CDentry *straydn;
6454 public:
6455 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6456 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6457 ServerLogContext(s, r),
6458 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6459 void finish(int r) override {
6460 assert(r == 0);
6461 server->_rename_finish(mdr, srcdn, destdn, straydn);
6462 }
6463 };
6464
6465
6466 /** handle_client_rename
6467 *
6468 * rename master is the destdn auth. this is because cached inodes
6469 * must remain connected. thus, any replica of srci must also
6470 * replicate destdn, and possibly straydn, so that srci (and
6471 * destdn->inode) remain connected during the rename.
6472 *
6473 * to do this, we freeze srci, then master (destdn auth) verifies that
6474 * all other nodes have also replicated destdn and straydn. note that
6475 * destdn replicas need not also replicate srci. this only works when
6476 * destdn is master.
6477 *
6478 * This function takes responsibility for the passed mdr.
6479 */
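// Editor's sketch of the witness protocol described above (op names as in
// MMDSSlaveRequest; the flow is reconstructed from _rename_prepare_witness
// and the prep/ack handlers, so treat it as an approximation):
#if 0
// master (destdn auth)                each witness, srcdn auth last
//   OP_RENAMEPREP            --->       journal ESlaveUpdate PREPARE
//   waiting_on_slave.insert(p)          (or ack unjournaled)
//                            <---       OP_RENAMEPREPACK
//   witnessed.insert(p); once waiting_on_slave drains,
//   dispatch_client_request() retries and journals the master EUpdate.
#endif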
6480 void Server::handle_client_rename(MDRequestRef& mdr)
6481 {
6482 MClientRequest *req = mdr->client_request;
6483 dout(7) << "handle_client_rename " << *req << dendl;
6484
6485 filepath destpath = req->get_filepath();
6486 filepath srcpath = req->get_filepath2();
6487 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6488 respond_to_request(mdr, -EINVAL);
6489 return;
6490 }
6491 const string &destname = destpath.last_dentry();
6492
6493 vector<CDentry*>& srctrace = mdr->dn[1];
6494 vector<CDentry*>& desttrace = mdr->dn[0];
6495
6496 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6497
6498 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6499 if (!destdn) return;
6500 dout(10) << " destdn " << *destdn << dendl;
6501 if (mdr->snapid != CEPH_NOSNAP) {
6502 respond_to_request(mdr, -EROFS);
6503 return;
6504 }
6505 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6506 CDir *destdir = destdn->get_dir();
6507 assert(destdir->is_auth());
6508
6509 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6510 if (r > 0)
6511 return; // delayed
6512 if (r < 0) {
6513 if (r == -ESTALE) {
6514 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6515 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6516 } else {
6517 dout(10) << "FAIL on error " << r << dendl;
6518 respond_to_request(mdr, r);
6519 }
6520 return;
6521
6522 }
6523 assert(!srctrace.empty());
6524 CDentry *srcdn = srctrace[srctrace.size()-1];
6525 dout(10) << " srcdn " << *srcdn << dendl;
6526 if (srcdn->last != CEPH_NOSNAP) {
6527 respond_to_request(mdr, -EROFS);
6528 return;
6529 }
6530 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6531 CInode *srci = srcdnl->get_inode();
6532 dout(10) << " srci " << *srci << dendl;
6533
6534 CInode *oldin = 0;
6535 if (!destdnl->is_null()) {
6536 //dout(10) << "dest dn exists " << *destdn << dendl;
6537 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6538 if (!oldin) return;
6539 dout(10) << " oldin " << *oldin << dendl;
6540
6541 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6542 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6543 respond_to_request(mdr, -ENOTEMPTY);
6544 return;
6545 }
6546
6547 // if srcdn is replica, need to make sure its linkage is correct
6548 if (srcdn->is_auth() ||
6549 srcdn->lock.can_read(mdr->get_client()) ||
6550 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
6551 // mv /some/thing /to/some/existing_other_thing
6552 if (oldin->is_dir() && !srci->is_dir()) {
6553 respond_to_request(mdr, -EISDIR);
6554 return;
6555 }
6556 if (!oldin->is_dir() && srci->is_dir()) {
6557 respond_to_request(mdr, -ENOTDIR);
6558 return;
6559 }
6560 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6561 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6562 return;
6563 }
6564 }
6565 }
6566
6567 // -- some sanity checks --
6568
6569 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6570 if (destpath.get_ino() != srcpath.get_ino() &&
6571 !(req->get_source().is_mds() &&
6572 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6573 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6574 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6575 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6576 while (srcbase != destbase &&
6577 !srcbase->is_projected_ancestor_of(destbase)) {
6578 CDentry *pdn = srcbase->get_projected_parent_dn();
6579 srctrace.insert(srctrace.begin(), pdn);
6580 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6581 srcbase = pdn->get_dir()->get_inode();
6582 }
6583
6584 // then, extend destpath until it shares the same parent inode as srcpath.
6585 while (destbase != srcbase) {
6586 CDentry *pdn = destbase->get_projected_parent_dn();
6587 desttrace.insert(desttrace.begin(), pdn);
6588 rdlocks.insert(&pdn->lock);
6589 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6590 destbase = pdn->get_dir()->get_inode();
6591 }
6592 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6593 }
6594
6595 // src == dest?
6596 if (srcdn->get_dir() == destdir && srcdn->name == destname) {
6597 dout(7) << "rename src=dest, noop" << dendl;
6598 respond_to_request(mdr, 0);
6599 return;
6600 }
6601
6602 // dest a child of src?
6603 // e.g. mv /usr /usr/foo
6604 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6605 while (pdn) {
6606 if (pdn == srcdn) {
6607 dout(7) << "cannot rename item to be a child of itself" << dendl;
6608 respond_to_request(mdr, -EINVAL);
6609 return;
6610 }
6611 pdn = pdn->get_dir()->inode->parent;
6612 }
6613
6614 // is this a stray migration, reintegration or merge? (sanity checks!)
6615 if (mdr->reqid.name.is_mds() &&
6616 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6617 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6618 !(destdnl->is_remote() &&
6619 destdnl->get_remote_ino() == srci->ino())) {
6620 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
6621 return;
6622 }
6623
6624 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6625 (srcdnl->is_primary() || destdnl->is_primary()));
6626 if (linkmerge)
6627 dout(10) << " this is a link merge" << dendl;
6628
6629 // -- create stray dentry? --
6630 CDentry *straydn = NULL;
6631 if (destdnl->is_primary() && !linkmerge) {
6632 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6633 if (!straydn)
6634 return;
6635 dout(10) << " straydn is " << *straydn << dendl;
6636 } else if (mdr->straydn) {
6637 mdr->unpin(mdr->straydn);
6638 mdr->straydn = NULL;
6639 }
6640
6641 // -- prepare witness list --
6642 /*
6643 * NOTE: we use _all_ replicas as witnesses.
6644 * this probably isn't totally necessary (esp for file renames),
6645 * but if/when we change that, we have to make sure rejoin is
6646 * sufficiently robust to handle strong rejoins from survivors
6647 * with totally wrong dentry->inode linkage.
6648 * (currently, it can ignore rename effects, because the resolve
6649 * stage will sort them out.)
6650 */
6651 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6652 if (srcdn->is_auth())
6653 srcdn->list_replicas(witnesses);
6654 else
6655 witnesses.insert(srcdn->authority().first);
6656 if (srcdnl->is_remote() && !srci->is_auth())
6657 witnesses.insert(srci->authority().first);
6658 destdn->list_replicas(witnesses);
6659 if (destdnl->is_remote() && !oldin->is_auth())
6660 witnesses.insert(oldin->authority().first);
6661 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6662
6663
6664 // -- locks --
6665 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6666
6667 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6668 for (int i=0; i<(int)srctrace.size(); i++)
6669 rdlocks.insert(&srctrace[i]->lock);
6670 xlocks.insert(&srcdn->lock);
6671 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6672 if (srcdirauth != mds->get_nodeid()) {
6673 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6674 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6675 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6676 if (srci->is_dir())
6677 rdlocks.insert(&srci->dirfragtreelock);
6678 } else {
6679 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6680 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6681 }
6682 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6683
6684 // straydn?
6685 if (straydn) {
6686 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6687 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6688 xlocks.insert(&straydn->lock);
6689 }
6690
6691 // xlock versionlock on dentries if there are witnesses.
6692 // replicas can't see projected dentry linkages, and will get
6693 // confused if we try to pipeline things.
6694 if (!witnesses.empty()) {
6695 // take xlock on all projected ancestor dentries for srcdn and destdn.
6696 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6697 for (int i= 0; i<(int)srctrace.size(); i++) {
6698 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6699 xlocks.insert(&srctrace[i]->versionlock);
6700 }
6701 for (int i=0; i<(int)desttrace.size(); i++) {
6702 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6703 xlocks.insert(&desttrace[i]->versionlock);
6704 }
6705 // xlock srci and oldin's primary dentries, so witnesses can call
6706 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6707 // is traversed.
6708 if (srcdnl->is_remote())
6709 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6710 if (destdnl->is_remote())
6711 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6712 }
6713
6714 // we need to update srci's ctime. xlock its least contended lock to do that...
6715 xlocks.insert(&srci->linklock);
6716
6717 // xlock oldin (for nlink--)
6718 if (oldin) {
6719 xlocks.insert(&oldin->linklock);
6720 if (oldin->is_dir())
6721 rdlocks.insert(&oldin->filelock);
6722 }
6723 if (srcdnl->is_primary() && srci->is_dir())
6724 // FIXME: this should happen whenever we are renaming between
6725 // realms, regardless of the file type
6726 // FIXME: If/when this changes, make sure to update the
6727 // "allowance" in handle_slave_rename_prep
6728 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6729 else
6730 rdlocks.insert(&srci->snaplock);
6731
6732 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6733 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6734 &remote_wrlocks, auth_pin_freeze))
6735 return;
6736
6737 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6738 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6739 return;
6740
6741 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6742 return;
6743
6744 if (!check_fragment_space(mdr, destdn->get_dir()))
6745 return;
6746
6747 if (!check_access(mdr, srci, MAY_WRITE))
6748 return;
6749 }
6750
6751 // with read lock, really verify oldin is empty
6752 if (oldin &&
6753 oldin->is_dir() &&
6754 _dir_is_nonempty(mdr, oldin)) {
6755 respond_to_request(mdr, -ENOTEMPTY);
6756 return;
6757 }
6758
6759 /* project_past_snaprealm_parent() will do this job
6760 *
6761 // moving between snaprealms?
6762 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6763 SnapRealm *srcrealm = srci->find_snaprealm();
6764 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6765 if (srcrealm != destrealm &&
6766 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6767 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6768 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6769 mdcache->snaprealm_create(mdr, srci);
6770 return;
6771 }
6772 }
6773 */
6774
6775 assert(g_conf->mds_kill_rename_at != 1);
6776
6777 // -- open all srcdn inode frags, if any --
6778 // we need these open so that auth can properly delegate from inode to dirfrags
6779 // after the inode is _ours_.
6780 if (srcdnl->is_primary() &&
6781 !srcdn->is_auth() &&
6782 srci->is_dir()) {
6783 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6784 mdr->set_stickydirs(srci);
6785
6786 list<frag_t> frags;
6787 srci->dirfragtree.get_leaves(frags);
6788 for (list<frag_t>::iterator p = frags.begin();
6789 p != frags.end();
6790 ++p) {
6791 CDir *dir = srci->get_dirfrag(*p);
6792 if (!dir) {
6793 dout(10) << " opening " << *p << " under " << *srci << dendl;
6794 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6795 return;
6796 }
6797 }
6798 }
6799
6800 // -- prepare witnesses --
6801
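// Each witness is a slave MDS that journals an OP_RENAMEPREP slave
// update (see handle_slave_rename_prep) before we commit, so the rename
// can be rolled forward or back if the master fails. srcdn's auth is
// contacted last because its prep ack also carries the exported inode
// state when srci is changing authority.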
6802 // do srcdn auth last
6803 mds_rank_t last = MDS_RANK_NONE;
6804 if (!srcdn->is_auth()) {
6805 last = srcdn->authority().first;
6806 mdr->more()->srcdn_auth_mds = last;
6807 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6808 // are involved in the rename operation.
6809 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6810 dout(10) << " preparing ambiguous auth for srci" << dendl;
6811 assert(mdr->more()->is_remote_frozen_authpin);
6812 assert(mdr->more()->rename_inode == srci);
6813 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6814 return;
6815 }
6816 }
6817
6818 for (set<mds_rank_t>::iterator p = witnesses.begin();
6819 p != witnesses.end();
6820 ++p) {
6821 if (*p == last) continue; // do it last!
6822 if (mdr->more()->witnessed.count(*p)) {
6823 dout(10) << " already witnessed by mds." << *p << dendl;
6824 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6825 dout(10) << " already waiting on witness mds." << *p << dendl;
6826 } else {
6827 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6828 return;
6829 }
6830 }
6831 if (!mdr->more()->waiting_on_slave.empty())
6832 return; // we're waiting for a witness.
6833
6834 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6835 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6836 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6837 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6838 return;
6839 }
6840
6841 // test hack: bail after the slave does prepare, so we can verify the _live_ rollback path.
6842 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6843 assert(g_conf->mds_kill_rename_at != 3);
6844 if (!mdr->more()->slaves.empty() && srci->is_dir())
6845 assert(g_conf->mds_kill_rename_at != 4);
6846
6847 // -- declare now --
6848 mdr->set_mds_stamp(ceph_clock_now());
6849
6850 // -- prepare journal entry --
6851 mdr->ls = mdlog->get_current_segment();
6852 EUpdate *le = new EUpdate(mdlog, "rename");
6853 mdlog->start_entry(le);
6854 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6855 if (!mdr->more()->witnessed.empty()) {
6856 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6857
6858 le->reqid = mdr->reqid;
6859 le->had_slaves = true;
6860
6861 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6862 // no need to send frozen auth pin to the recovering auth MDS of srci
6863 mdr->more()->is_remote_frozen_authpin = false;
6864 }
6865
6866 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6867 if (le->client_map.length())
6868 le->cmapv = mds->sessionmap.get_projected();
6869
6870 // -- commit locally --
6871 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6872
6873 journal_and_reply(mdr, srci, destdn, le, fin);
6874 }
6875
6876
6877 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
6878 {
6879 dout(10) << "_rename_finish " << *mdr << dendl;
6880
6881 if (!mdr->more()->witnessed.empty())
6882 mdcache->logged_master_update(mdr->reqid);
6883
6884 // apply
6885 _rename_apply(mdr, srcdn, destdn, straydn);
6886
6887 mdcache->send_dentry_link(destdn, mdr);
6888
6889 CDentry::linkage_t *destdnl = destdn->get_linkage();
6890 CInode *in = destdnl->get_inode();
6891 bool need_eval = mdr->more()->cap_imports.count(in);
6892
6893 // test hack: test slave commit
6894 if (!mdr->more()->slaves.empty() && !in->is_dir())
6895 assert(g_conf->mds_kill_rename_at != 5);
6896 if (!mdr->more()->slaves.empty() && in->is_dir())
6897 assert(g_conf->mds_kill_rename_at != 6);
6898
6899 // bump popularity
6900 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
6901 if (destdnl->is_remote() && in->is_auth())
6902 mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
6903
6904 // did we import srci? if so, explicitly ack the import before we unlock and reply.
6905
6906 assert(g_conf->mds_kill_rename_at != 7);
6907
6908 // reply
6909 respond_to_request(mdr, 0);
6910
6911 if (need_eval)
6912 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
6913
6914 // clean up?
6915 // respond_to_request() drops locks. So stray reintegration can race with us.
6916 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6917 mdcache->notify_stray(straydn);
6918 }
6919 }
6920
6921
6922
6923 // helpers
6924
6925 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
6926 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
6927 {
6928 if (mds->is_cluster_degraded() &&
6929 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6930 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
6931 if (mdr->more()->waiting_on_slave.empty())
6932 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6933 return false;
6934 }
6935
6936 dout(10) << "_rename_prepare_witness mds." << who << dendl;
6937 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6938 MMDSSlaveRequest::OP_RENAMEPREP);
6939
6940 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
6941 for (auto dn : srctrace)
6942 req->srcdnpath.push_dentry(dn->name);
6943 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
6944 for (auto dn : dsttrace)
6945 req->destdnpath.push_dentry(dn->name);
6946 if (straydn)
6947 mdcache->replicate_stray(straydn, who, req->stray);
6948
6949 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
6950
6951 // srcdn auth will verify our current witness list is sufficient
6952 req->witnesses = witnesses;
6953
6954 req->op_stamp = mdr->get_op_stamp();
6955 mds->send_message_mds(req, who);
6956
6957 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6958 mdr->more()->waiting_on_slave.insert(who);
6959 return true;
6960 }
6961
6962 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
6963 {
6964 version_t oldpv = mdr->more()->inode_import_v;
6965
6966 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
6967
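// mdr->more()->inode_import is the blob srcdn's auth attached to its
// OP_RENAMEPREPACK (see _logged_slave_rename): an exported client map
// followed by the encoded inode, decoded and spliced into our cache here.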
6968 /* import node */
6969 bufferlist::iterator blp = mdr->more()->inode_import.begin();
6970
6971 // imported caps
6972 ::decode(mdr->more()->imported_client_map, blp);
6973 ::encode(mdr->more()->imported_client_map, *client_map_bl,
6974 mds->mdsmap->get_up_features());
6975 prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
6976
6977 list<ScatterLock*> updated_scatterlocks;
6978 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
6979 mdr->more()->cap_imports, updated_scatterlocks);
6980
6981 // hack: force back to !auth and clean, temporarily
6982 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
6983 srcdnl->get_inode()->mark_clean();
6984
6985 return oldpv;
6986 }
6987
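// Decide whether a rename touching a non-auth dentry must still be
// journaled locally: if any of diri's dirfrags is an auth subtree root
// (empty == true), or an auth subtree is nested somewhere beneath them
// (empty == false), replay needs the event to rebuild the subtree map.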
6988 bool Server::_need_force_journal(CInode *diri, bool empty)
6989 {
6990 list<CDir*> ls;
6991 diri->get_dirfrags(ls);
6992
6993 bool force_journal = false;
6994 if (empty) {
6995 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6996 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
6997 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
6998 force_journal = true;
6999 break;
7000 } else
7001 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
7002 }
7003 } else {
7004 // see if any children of our frags are auth subtrees.
7005 list<CDir*> subtrees;
7006 mdcache->list_subtrees(subtrees);
7007 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
7008 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7009 CDir *dir = *p;
7010 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
7011 if (dir->contains(*q)) {
7012 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
7013 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
7014 << **q << dendl;
7015 force_journal = true;
7016 break;
7017 } else
7018 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
7019 } else
7020 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
7021 }
7022 if (force_journal)
7023 break;
7024 }
7025 }
7026 return force_journal;
7027 }
7028
7029 void Server::_rename_prepare(MDRequestRef& mdr,
7030 EMetaBlob *metablob, bufferlist *client_map_bl,
7031 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7032 {
7033 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7034 if (straydn)
7035 dout(10) << " straydn " << *straydn << dendl;
7036
7037 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7038 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7039 CInode *srci = srcdnl->get_inode();
7040 CInode *oldin = destdnl->get_inode();
7041
7042 // primary+remote link merge?
7043 bool linkmerge = (srci == destdnl->get_inode() &&
7044 (srcdnl->is_primary() || destdnl->is_primary()));
7045 bool silent = srcdn->get_dir()->inode->is_stray();
7046
7047 bool force_journal_dest = false;
7048 if (srci->is_dir() && !destdn->is_auth()) {
7049 if (srci->is_auth()) {
7050 // if we are auth for srci and exporting it, force journal because journal replay needs
7051 // the source inode to create auth subtrees.
7052 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7053 force_journal_dest = true;
7054 } else
7055 force_journal_dest = _need_force_journal(srci, false);
7056 }
7057
7058 bool force_journal_stray = false;
7059 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7060 force_journal_stray = _need_force_journal(oldin, true);
7061
7062 if (linkmerge)
7063 dout(10) << " merging remote and primary links to the same inode" << dendl;
7064 if (silent)
7065 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7066 if (force_journal_dest)
7067 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7068 if (force_journal_stray)
7069 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7070
7071 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7072 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7073 metablob->renamed_dirino = srci->ino();
7074 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7075 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7076 metablob->renamed_dirino = oldin->ino();
7077 }
7078
7079 // prepare
7080 inode_t *pi = 0; // renamed inode
7081 inode_t *tpi = 0; // target/overwritten inode
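// pi/tpi are projected copies: all mutations below go into the
// projections and the metablob only, and become visible when
// _rename_apply() pops them after the journal entry commits.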
7082
7083 // target inode
7084 if (!linkmerge) {
7085 if (destdnl->is_primary()) {
7086 assert(straydn); // moving to straydn.
7087 // link--, and move.
7088 if (destdn->is_auth()) {
7089 tpi = oldin->project_inode(); //project_snaprealm
7090 tpi->version = straydn->pre_dirty(tpi->version);
7091 tpi->update_backtrace();
7092 }
7093 straydn->push_projected_linkage(oldin);
7094 } else if (destdnl->is_remote()) {
7095 // nlink-- targeti
7096 if (oldin->is_auth()) {
7097 tpi = oldin->project_inode();
7098 tpi->version = oldin->pre_dirty();
7099 }
7100 }
7101 }
7102
7103 // dest
7104 if (srcdnl->is_remote()) {
7105 if (!linkmerge) {
7106 // destdn
7107 if (destdn->is_auth())
7108 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
7109 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7110 // srci
7111 if (srci->is_auth()) {
7112 pi = srci->project_inode();
7113 pi->version = srci->pre_dirty();
7114 }
7115 } else {
7116 dout(10) << " will merge remote onto primary link" << dendl;
7117 if (destdn->is_auth()) {
7118 pi = oldin->project_inode();
7119 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
7120 }
7121 }
7122 } else { // primary
7123 if (destdn->is_auth()) {
7124 version_t oldpv;
7125 if (srcdn->is_auth())
7126 oldpv = srci->get_projected_version();
7127 else {
7128 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
7129
7130 // note which dirfrags have child subtrees in the journal
7131 // event, so that we can open those (as bounds) during replay.
7132 if (srci->is_dir()) {
7133 list<CDir*> ls;
7134 srci->get_dirfrags(ls);
7135 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7136 CDir *dir = *p;
7137 if (!dir->is_auth())
7138 metablob->renamed_dir_frags.push_back(dir->get_frag());
7139 }
7140 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
7141 }
7142 }
7143 pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7144 // & srcdnl->snaprealm
7145 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
7146 pi->update_backtrace();
7147 }
7148 destdn->push_projected_linkage(srci);
7149 }
7150
7151 // src
7152 if (srcdn->is_auth())
7153 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
7154 srcdn->push_projected_linkage(); // push null linkage
7155
7156 if (!silent) {
7157 if (pi) {
7158 pi->ctime = mdr->get_op_stamp();
7159 pi->change_attr++;
7160 if (linkmerge)
7161 pi->nlink--;
7162 }
7163 if (tpi) {
7164 tpi->ctime = mdr->get_op_stamp();
7165 tpi->change_attr++;
7166 destdn->make_path_string(tpi->stray_prior_path, true);
7167 tpi->nlink--;
7168 if (tpi->nlink == 0)
7169 oldin->state_set(CInode::STATE_ORPHAN);
7170 }
7171 }
7172
7173 // prepare nesting, mtime updates
7174 int predirty_dir = silent ? 0:PREDIRTY_DIR;
7175
7176 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7177 // then link the source inode to destdn
7178 if (destdnl->is_primary()) {
7179 assert(straydn);
7180 if (straydn->is_auth()) {
7181 metablob->add_dir_context(straydn->get_dir());
7182 metablob->add_dir(straydn->get_dir(), true);
7183 }
7184 }
7185
7186 // sub off target
7187 if (destdn->is_auth() && !destdnl->is_null()) {
7188 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
7189 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
7190 if (destdnl->is_primary()) {
7191 assert(straydn);
7192 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
7193 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7194 }
7195 }
7196
7197 // move srcdn
7198 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
7199 int flags = predirty_dir | predirty_primary;
7200 if (srcdn->is_auth())
7201 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
7202 if (destdn->is_auth())
7203 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
7204
7205 SnapRealm *src_realm = srci->find_snaprealm();
7206 SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
7207 snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
7208
7209 // add it all to the metablob
7210 // target inode
7211 if (!linkmerge) {
7212 if (destdnl->is_primary()) {
7213 assert(straydn);
7214 if (destdn->is_auth()) {
7215 // project snaprealm, too
7216 if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
7217 oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
7218 straydn->first = MAX(oldin->first, next_dest_snap);
7219 metablob->add_primary_dentry(straydn, oldin, true, true);
7220 } else if (force_journal_stray) {
7221 dout(10) << " forced journaling straydn " << *straydn << dendl;
7222 metablob->add_dir_context(straydn->get_dir());
7223 metablob->add_primary_dentry(straydn, oldin, true);
7224 }
7225 } else if (destdnl->is_remote()) {
7226 if (oldin->is_auth()) {
7227 // auth for targeti
7228 metablob->add_dir_context(oldin->get_projected_parent_dir());
7229 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
7230 CEPH_NOSNAP, 0, destdnl);
7231 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
7232 }
7233 }
7234 }
7235
7236 // dest
7237 if (srcdnl->is_remote()) {
7238 if (!linkmerge) {
7239 if (destdn->is_auth() && !destdnl->is_null())
7240 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7241 else
7242 destdn->first = MAX(destdn->first, next_dest_snap);
7243
7244 if (destdn->is_auth())
7245 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7246 if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
7247 metablob->add_dir_context(srci->get_projected_parent_dir());
7248 mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
7249 metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
7250 }
7251 } else {
7252 if (destdn->is_auth() && !destdnl->is_null())
7253 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7254 else
7255 destdn->first = MAX(destdn->first, next_dest_snap);
7256
7257 if (destdn->is_auth())
7258 metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
7259 }
7260 } else if (srcdnl->is_primary()) {
7261 // project snap parent update?
7262 if (destdn->is_auth() && src_realm != dest_realm &&
7263 (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
7264 srci->project_past_snaprealm_parent(dest_realm);
7265
7266 if (destdn->is_auth() && !destdnl->is_null())
7267 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7268 else
7269 destdn->first = MAX(destdn->first, next_dest_snap);
7270
7271 if (destdn->is_auth())
7272 metablob->add_primary_dentry(destdn, srci, true, true);
7273 else if (force_journal_dest) {
7274 dout(10) << " forced journaling destdn " << *destdn << dendl;
7275 metablob->add_dir_context(destdn->get_dir());
7276 metablob->add_primary_dentry(destdn, srci, true);
7277 if (srcdn->is_auth() && srci->is_dir()) {
7278 // journal the new subtrees' root dirfrags
7279 list<CDir*> ls;
7280 srci->get_dirfrags(ls);
7281 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7282 CDir *dir = *p;
7283 if (dir->is_auth())
7284 metablob->add_dir(dir, true);
7285 }
7286 }
7287 }
7288 }
7289
7290 // src
7291 if (srcdn->is_auth()) {
7292 dout(10) << " journaling srcdn " << *srcdn << dendl;
7293 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
7294 // also journal the inode in case we need to do a slave rename rollback. It is OK to add
7295 // both primary and null dentries, because during journal replay the null dentry is
7296 // processed after the primary dentry.
7297 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
7298 metablob->add_primary_dentry(srcdn, srci, true);
7299 metablob->add_null_dentry(srcdn, true);
7300 } else
7301 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
7302
7303 // make renamed inode first track the dn
7304 if (srcdnl->is_primary() && destdn->is_auth())
7305 srci->first = destdn->first;
7306
7307 if (oldin && oldin->is_dir()) {
7308 assert(straydn);
7309 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
7310 }
7311 if (srci->is_dir())
7312 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
7313
7314 }
7315
7316
7317 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7318 {
7319 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7320 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
7321
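// Pop the linkages and inodes projected by _rename_prepare(), in the
// same order: the overwritten target (stray or remote) first, then
// destdn, then srcdn.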
7322 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7323 CDentry::linkage_t *destdnl = destdn->get_linkage();
7324
7325 CInode *oldin = destdnl->get_inode();
7326
7327 // primary+remote link merge?
7328 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7329 (srcdnl->is_primary() || destdnl->is_primary()));
7330
7331 // target inode
7332 if (!linkmerge) {
7333 if (destdnl->is_primary()) {
7334 assert(straydn);
7335 dout(10) << "straydn is " << *straydn << dendl;
7336 destdn->get_dir()->unlink_inode(destdn, false);
7337
7338 straydn->pop_projected_linkage();
7339 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7340 assert(!straydn->is_projected()); // no other projected
7341
7342 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7343
7344 // nlink-- targeti
7345 if (destdn->is_auth()) {
7346 bool hadrealm = (oldin->snaprealm ? true : false);
7347 oldin->pop_and_dirty_projected_inode(mdr->ls);
7348 if (oldin->snaprealm && !hadrealm)
7349 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
7350 } else {
7351 // FIXME this snaprealm is not filled out correctly
7352 //oldin->open_snaprealm(); might be sufficient..
7353 }
7354 } else if (destdnl->is_remote()) {
7355 destdn->get_dir()->unlink_inode(destdn, false);
7356 if (oldin->is_auth())
7357 oldin->pop_and_dirty_projected_inode(mdr->ls);
7358 }
7359 }
7360
7361 // unlink src before we relink it at dest
7362 CInode *in = srcdnl->get_inode();
7363 assert(in);
7364
7365 bool srcdn_was_remote = srcdnl->is_remote();
7366 srcdn->get_dir()->unlink_inode(srcdn);
7367
7368 // dest
7369 if (srcdn_was_remote) {
7370 if (!linkmerge) {
7371 // destdn
7372 destdnl = destdn->pop_projected_linkage();
7373 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7374 assert(!destdn->is_projected()); // no other projected
7375
7376 destdn->link_remote(destdnl, in);
7377 if (destdn->is_auth())
7378 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
7379 // in
7380 if (in->is_auth())
7381 in->pop_and_dirty_projected_inode(mdr->ls);
7382 } else {
7383 dout(10) << "merging remote onto primary link" << dendl;
7384 oldin->pop_and_dirty_projected_inode(mdr->ls);
7385 }
7386 } else { // primary
7387 if (linkmerge) {
7388 dout(10) << "merging primary onto remote link" << dendl;
7389 destdn->get_dir()->unlink_inode(destdn, false);
7390 }
7391 destdnl = destdn->pop_projected_linkage();
7392 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7393 assert(!destdn->is_projected()); // no other projected
7394
7395 // srcdn inode import?
7396 if (!srcdn->is_auth() && destdn->is_auth()) {
7397 assert(mdr->more()->inode_import.length() > 0);
7398
7399 map<client_t,Capability::Import> imported_caps;
7400
7401 // finish cap imports
7402 finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
7403 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
7404 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
7405 mdr->more()->srcdn_auth_mds, true,
7406 mdr->more()->cap_imports[destdnl->get_inode()],
7407 imported_caps);
7408 }
7409
7410 mdr->more()->inode_import.clear();
7411 ::encode(imported_caps, mdr->more()->inode_import);
7412
7413 /* hack: add an auth pin for each xlock we hold. These were
7414 * remote xlocks previously but now they're local and
7415 * we're going to try to unpin them when we xlock_finish. */
7416 for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
7417 i != mdr->xlocks.end();
7418 ++i)
7419 if ((*i)->get_parent() == destdnl->get_inode() &&
7420 !(*i)->is_locallock())
7421 mds->locker->xlock_import(*i);
7422
7423 // hack: fix auth bit
7424 in->state_set(CInode::STATE_AUTH);
7425
7426 mdr->clear_ambiguous_auth();
7427 }
7428
7429 if (destdn->is_auth()) {
7430 in->pop_and_dirty_projected_inode(mdr->ls);
7431
7432 } else {
7433 // FIXME: fix up snaprealm!
7434 }
7435 }
7436
7437 // src
7438 if (srcdn->is_auth())
7439 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
7440 srcdn->pop_projected_linkage();
7441 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7442 assert(!srcdn->is_projected()); // no other projected
7443
7444 // apply remaining projected inodes (nested)
7445 mdr->apply();
7446
7447 // update subtree map?
7448 if (destdnl->is_primary() && in->is_dir())
7449 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7450
7451 if (straydn && oldin->is_dir())
7452 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
7453
7454 // removing a new dn?
7455 if (srcdn->is_auth())
7456 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
7457 }
7458
7459
7460
7461 // ------------
7462 // SLAVE
7463
7464 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7465 CDentry *srcdn, *destdn, *straydn;
7466 public:
7467 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7468 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7469 void finish(int r) override {
7470 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7471 }
7472 };
7473
7474 class C_MDS_SlaveRenameCommit : public ServerContext {
7475 MDRequestRef mdr;
7476 CDentry *srcdn, *destdn, *straydn;
7477 public:
7478 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7479 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7480 void finish(int r) override {
7481 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7482 }
7483 };
7484
7485 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7486 MDRequestRef mdr;
7487 public:
7488 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7489 ServerContext(s), mdr(r) {}
7490 void finish(int r) override {
7491 server->_slave_rename_sessions_flushed(mdr);
7492 }
7493 };
7494
7495 /* This function DOES put the mdr->slave_request before returning*/
7496 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7497 {
7498 dout(10) << "handle_slave_rename_prep " << *mdr
7499 << " " << mdr->slave_request->srcdnpath
7500 << " to " << mdr->slave_request->destdnpath
7501 << dendl;
7502
7503 if (mdr->slave_request->is_interrupted()) {
7504 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7505 MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7506 reply->mark_interrupted();
7507 mds->send_message_mds(reply, mdr->slave_to_mds);
7508 mdr->slave_request->put();
7509 mdr->slave_request = 0;
7510 return;
7511 }
7512
7513 // discover destdn
7514 filepath destpath(mdr->slave_request->destdnpath);
7515 dout(10) << " dest " << destpath << dendl;
7516 vector<CDentry*> trace;
7517 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7518 if (r > 0) return;
7519 if (r == -ESTALE) {
7520 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7521 mdr->slave_to_mds);
7522 return;
7523 }
7524 assert(r == 0); // we shouldn't get an error here!
7525
7526 CDentry *destdn = trace[trace.size()-1];
7527 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7528 dout(10) << " destdn " << *destdn << dendl;
7529 mdr->pin(destdn);
7530
7531 // discover srcdn
7532 filepath srcpath(mdr->slave_request->srcdnpath);
7533 dout(10) << " src " << srcpath << dendl;
7534 CInode *srci = nullptr;
7535 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7536 if (r > 0) return;
7537 assert(r == 0);
7538
7539 // srcpath must not point to a null dentry
7540 assert(srci != nullptr);
7541
7542 CDentry *srcdn = trace[trace.size()-1];
7543 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7544 dout(10) << " srcdn " << *srcdn << dendl;
7545 mdr->pin(srcdn);
7546 mdr->pin(srci);
7547
7548 // stray?
7549 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7550 (srcdnl->is_primary() || destdnl->is_primary()));
7551 CDentry *straydn = mdr->straydn;
7552 if (destdnl->is_primary() && !linkmerge)
7553 assert(straydn);
7554
7555 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7556 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7557
7558 // set up commit waiter (early, to clean up any freezing etc we do)
7559 if (!mdr->more()->slave_commit)
7560 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7561
7562 // am i srcdn auth?
7563 if (srcdn->is_auth()) {
7564 set<mds_rank_t> srcdnrep;
7565 srcdn->list_replicas(srcdnrep);
7566
7567 bool reply_witness = false;
7568 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7569 // freeze?
7570 // we need this to
7571 // - avoid conflicting lock state changes
7572 // - avoid concurrent updates to the inode
7573 // (this could also be accomplished with the versionlock)
7574 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7575 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
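// (the master xlocks srci->snaplock for dir renames, hence the extra
// pin; see the snaplock xlock taken in handle_client_rename)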
7576 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7577 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7578
7579 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7580 if (srcdnl->get_inode()->is_frozen_auth_pin())
7581 mdr->unfreeze_auth_pin();
7582
7583 if (!frozen_inode) {
7584 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7585 return;
7586 }
7587
7588 /*
7589 * set ambiguous auth for srci
7590 * NOTE: we don't worry about ambiguous cache expire as we do
7591 * with subtree migrations because all slaves will pin
7592 * srcdn->get_inode() for the duration of this rename.
7593 */
7594 mdr->set_ambiguous_auth(srcdnl->get_inode());
7595
7596 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7597 // the master will send another OP_RENAMEPREP slave request later.
7598 if (mdr->slave_request->witnesses.size() > 1) {
7599 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7600 reply_witness = true;
7601 }
7602
7603 // make sure bystanders have received all lock related messages
7604 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7605 if (*p == mdr->slave_to_mds ||
7606 (mds->is_cluster_degraded() &&
7607 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7608 continue;
7609 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7610 MMDSSlaveRequest::OP_RENAMENOTIFY);
7611 mds->send_message_mds(notify, *p);
7612 mdr->more()->waiting_on_slave.insert(*p);
7613 }
7614
7615 // make sure clients have received all cap related messages
7616 set<client_t> export_client_set;
7617 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7618
7619 MDSGatherBuilder gather(g_ceph_context);
7620 flush_client_sessions(export_client_set, gather);
7621 if (gather.has_subs()) {
7622 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7623 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7624 gather.activate();
7625 }
7626 }
7627
7628 // is witness list sufficient?
7629 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7630 if (*p == mdr->slave_to_mds ||
7631 mdr->slave_request->witnesses.count(*p)) continue;
7632 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7633 reply_witness = true;
7634 break;
7635 }
7636
7637 if (reply_witness) {
7638 assert(!srcdnrep.empty());
7639 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7640 MMDSSlaveRequest::OP_RENAMEPREPACK);
7641 reply->witnesses.swap(srcdnrep);
7642 mds->send_message_mds(reply, mdr->slave_to_mds);
7643 mdr->slave_request->put();
7644 mdr->slave_request = 0;
7645 return;
7646 }
7647 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7648 if (!mdr->more()->waiting_on_slave.empty()) {
7649 dout(10) << " still waiting for rename notify acks from "
7650 << mdr->more()->waiting_on_slave << dendl;
7651 return;
7652 }
7653 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7654 // set ambiguous auth for srci on witnesses
7655 mdr->set_ambiguous_auth(srcdnl->get_inode());
7656 }
7657
7658 // encode everything we'd need to roll this back... basically, just the original state.
7659 rename_rollback rollback;
7660
7661 rollback.reqid = mdr->reqid;
7662
7663 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7664 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7665 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7666 rollback.orig_src.dname = srcdn->name;
7667 if (srcdnl->is_primary())
7668 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7669 else {
7670 assert(srcdnl->is_remote());
7671 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7672 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7673 }
7674
7675 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7676 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7677 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7678 rollback.orig_dest.dname = destdn->name;
7679 if (destdnl->is_primary())
7680 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7681 else if (destdnl->is_remote()) {
7682 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7683 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7684 }
7685
7686 if (straydn) {
7687 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7688 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7689 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7690 rollback.stray.dname = straydn->name;
7691 }
7692 ::encode(rollback, mdr->more()->rollback_bl);
7693 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7694
7695 // journal.
7696 mdr->ls = mdlog->get_current_segment();
7697 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7698 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7699 mdlog->start_entry(le);
7700 le->rollback = mdr->more()->rollback_bl;
7701
7702 bufferlist blah; // inode import data... obviously not used if we're the slave
7703 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7704
7705 if (le->commit.empty()) {
7706 dout(10) << " empty metablob, skipping journal" << dendl;
7707 mdlog->cancel_entry(le);
7708 mdr->ls = NULL;
7709 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7710 } else {
7711 mdr->more()->slave_update_journaled = true;
7712 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7713 mdr, __func__);
7714 mdlog->flush();
7715 }
7716 }
7717
7718 void Server::_logged_slave_rename(MDRequestRef& mdr,
7719 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7720 {
7721 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7722
7723 // prepare ack
7724 MMDSSlaveRequest *reply = NULL;
7725 if (!mdr->aborted) {
7726 reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7727 if (!mdr->more()->slave_update_journaled)
7728 reply->mark_not_journaled();
7729 }
7730
7731 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7732 CDentry::linkage_t *destdnl = NULL;
7733 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7734
7735 // export srci?
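// If we are srcdn's auth and the primary inode is leaving us, encode the
// full inode (with its client/cap state, and dirfrags marked as export
// bounds) into the prep ack so the master can import it on destdn's side.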
7736 if (srcdn->is_auth() && srcdnl->is_primary()) {
7737 // set export bounds for CInode::encode_export()
7738 list<CDir*> bounds;
7739 if (srcdnl->get_inode()->is_dir()) {
7740 srcdnl->get_inode()->get_dirfrags(bounds);
7741 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7742 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7743 }
7744
7745 map<client_t,entity_inst_t> exported_client_map;
7746 bufferlist inodebl;
7747 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7748 exported_client_map);
7749
7750 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7751 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7752
7753 if (reply) {
7754 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7755 reply->inode_export.claim_append(inodebl);
7756 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7757 }
7758
7759 // remove mdr auth pin
7760 mdr->auth_unpin(srcdnl->get_inode());
7761 mdr->more()->is_inode_exporter = true;
7762
7763 if (srcdnl->get_inode()->is_dirty())
7764 srcdnl->get_inode()->mark_clean();
7765
7766 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
7767 }
7768
7769 // apply
7770 _rename_apply(mdr, srcdn, destdn, straydn);
7771
7772 destdnl = destdn->get_linkage();
7773
7774 // bump popularity
7775 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
7776 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7777 mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
7778 META_POP_IWR);
7779
7780 // done.
7781 mdr->slave_request->put();
7782 mdr->slave_request = 0;
7783 mdr->straydn = 0;
7784
7785 if (reply) {
7786 mds->send_message_mds(reply, mdr->slave_to_mds);
7787 } else {
7788 assert(mdr->aborted);
7789 dout(10) << " abort flag set, finishing" << dendl;
7790 mdcache->request_finish(mdr);
7791 }
7792 }
7793
7794 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7795 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7796 {
7797 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7798
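// r == 0: the master committed -- finish the inode export, clear the
// ambiguous-auth state and journal OP_COMMIT. r < 0: the master aborted
// -- replay the rollback blob we stashed in handle_slave_rename_prep.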
7799 CDentry::linkage_t *destdnl = destdn->get_linkage();
7800
7801 list<MDSInternalContextBase*> finished;
7802 if (r == 0) {
7803 // unfreeze+singleauth inode
7804 // hmm, do i really need to delay this?
7805 if (mdr->more()->is_inode_exporter) {
7806
7807 CInode *in = destdnl->get_inode();
7808
7809 // drop our pins
7810 // we exported, clear out any xlocks that we moved to another MDS
7811 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7812 while (i != mdr->xlocks.end()) {
7813 SimpleLock *lock = *i++;
7814
7815 // we only care about xlocks on the exported inode
7816 if (lock->get_parent() == in &&
7817 !lock->is_locallock())
7818 mds->locker->xlock_export(lock, mdr.get());
7819 }
7820
7821 map<client_t,Capability::Import> peer_imported;
7822 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7823 ::decode(peer_imported, bp);
7824
7825 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7826 mdcache->migrator->finish_export_inode(destdnl->get_inode(),
7827 mdr->get_mds_stamp(),
7828 mdr->slave_to_mds, peer_imported, finished);
7829 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
7830
7831 // unfreeze
7832 assert(destdnl->get_inode()->is_frozen_inode());
7833 destdnl->get_inode()->unfreeze_inode(finished);
7834 }
7835
7836 // singleauth
7837 if (mdr->more()->is_ambiguous_auth) {
7838 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7839 mdr->more()->is_ambiguous_auth = false;
7840 }
7841
7842 if (straydn && mdr->more()->slave_update_journaled) {
7843 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7844 if (strayin && !strayin->snaprealm)
7845 mdcache->clear_dirty_bits_for_stray(strayin);
7846 }
7847
7848 mds->queue_waiters(finished);
7849 mdr->cleanup();
7850
7851 if (mdr->more()->slave_update_journaled) {
7852 // write a commit to the journal
7853 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7854 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7855 ESlaveUpdate::RENAME);
7856 mdlog->start_entry(le);
7857 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7858 mdlog->flush();
7859 } else {
7860 _committed_slave(mdr);
7861 }
7862 } else {
7863
7864 // abort
7865 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7866 // witness list to the master, and they failed before we tried prep again.
7867 if (mdr->more()->rollback_bl.length()) {
7868 if (mdr->more()->is_inode_exporter) {
7869 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7870 destdnl->get_inode()->abort_export();
7871 }
7872 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7873 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7874 // rollback but preserve the slave request
7875 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7876 mdr->more()->rollback_bl.clear();
7877 } else
7878 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7879 } else {
7880 dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
7881 // singleauth
7882 if (mdr->more()->is_ambiguous_auth) {
7883 if (srcdn->is_auth())
7884 mdr->more()->rename_inode->unfreeze_inode(finished);
7885
7886 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7887 mdr->more()->is_ambiguous_auth = false;
7888 }
7889 mds->queue_waiters(finished);
7890 mdcache->request_finish(mdr);
7891 }
7892 }
7893 }
7894
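// Revert one dirfrag's accounting during rollback: linkunlink is +1 when a
// dentry's contribution is being re-added to this frag and -1 when it is
// being removed, and the old mtime/rctime are restored only if this rename
// was the last thing to touch them (i.e. they still match its ctime).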
7895 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7896 bool isdir, int linkunlink, nest_info_t &rstat)
7897 {
7898 fnode_t *pf;
7899 pf = dir->project_fnode();
7900 mut->add_projected_fnode(dir);
7901 pf->version = dir->pre_dirty();
7902
7903 if (isdir) {
7904 pf->fragstat.nsubdirs += linkunlink;
7905 } else {
7906 pf->fragstat.nfiles += linkunlink;
7907 }
7908 if (r.ino) {
7909 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7910 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7911 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7912 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7913 }
7914 if (pf->fragstat.mtime == ctime) {
7915 pf->fragstat.mtime = r.dirfrag_old_mtime;
7916 if (pf->rstat.rctime == ctime)
7917 pf->rstat.rctime = r.dirfrag_old_rctime;
7918 }
7919 mut->add_updated_lock(&dir->get_inode()->filelock);
7920 mut->add_updated_lock(&dir->get_inode()->nestlock);
7921 }
7922
7923 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
7924 MutationRef mut;
7925 CDentry *srcdn;
7926 version_t srcdnpv;
7927 CDentry *destdn;
7928 CDentry *straydn;
7929 bool finish_mdr;
7930 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
7931 CDentry *sd, version_t pv, CDentry *dd,
7932 CDentry *st, bool f) :
7933 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
7934 straydn(st), finish_mdr(f) {}
7935 void finish(int r) override {
7936 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
7937 destdn, straydn, finish_mdr);
7938 }
7939 };
7940
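// Undo a prepared-but-uncommitted slave rename using the rollback blob
// encoded by handle_slave_rename_prep: re-point srcdn/destdn/straydn at
// their original linkages, repair dirfrag stats and ctimes, and journal an
// OP_ROLLBACK ESlaveUpdate (unless nothing was journaled locally).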
7941 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
7942 bool finish_mdr)
7943 {
7944 rename_rollback rollback;
7945 bufferlist::iterator p = rbl.begin();
7946 ::decode(rollback, p);
7947
7948 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
7949 // need to finish this update before sending resolve to claim the subtree
7950 mdcache->add_rollback(rollback.reqid, master);
7951
7952 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7953 mut->ls = mds->mdlog->get_current_segment();
7954
7955 CDentry *srcdn = NULL;
7956 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
7957 if (!srcdir)
7958 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
7959 if (srcdir) {
7960 dout(10) << " srcdir " << *srcdir << dendl;
7961 srcdn = srcdir->lookup(rollback.orig_src.dname);
7962 if (srcdn) {
7963 dout(10) << " srcdn " << *srcdn << dendl;
7964 assert(srcdn->get_linkage()->is_null());
7965 } else
7966 dout(10) << " srcdn not found" << dendl;
7967 } else
7968 dout(10) << " srcdir not found" << dendl;
7969
7970 CDentry *destdn = NULL;
7971 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
7972 if (!destdir)
7973 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
7974 if (destdir) {
7975 dout(10) << " destdir " << *destdir << dendl;
7976 destdn = destdir->lookup(rollback.orig_dest.dname);
7977 if (destdn)
7978 dout(10) << " destdn " << *destdn << dendl;
7979 else
7980 dout(10) << " destdn not found" << dendl;
7981 } else
7982 dout(10) << " destdir not found" << dendl;
7983
7984 CInode *in = NULL;
7985 if (rollback.orig_src.ino) {
7986 in = mdcache->get_inode(rollback.orig_src.ino);
7987 if (in && in->is_dir())
7988 assert(srcdn && destdn);
7989 } else
7990 in = mdcache->get_inode(rollback.orig_src.remote_ino);
7991
7992 CDir *straydir = NULL;
7993 CDentry *straydn = NULL;
7994 if (rollback.stray.dirfrag.ino) {
7995 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
7996 if (straydir) {
7997 dout(10) << "straydir " << *straydir << dendl;
7998 straydn = straydir->lookup(rollback.stray.dname);
7999 if (straydn) {
8000 dout(10) << " straydn " << *straydn << dendl;
8001 assert(straydn->get_linkage()->is_primary());
8002 } else
8003 dout(10) << " straydn not found" << dendl;
8004 } else
8005 dout(10) << "straydir not found" << dendl;
8006 }
8007
8008 CInode *target = NULL;
8009 if (rollback.orig_dest.ino) {
8010 target = mdcache->get_inode(rollback.orig_dest.ino);
8011 if (target)
8012 assert(destdn && straydn);
8013 } else if (rollback.orig_dest.remote_ino)
8014 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
8015
8016 // can't use is_auth() in the resolve stage
8017 mds_rank_t whoami = mds->get_nodeid();
8018 // slave
8019 assert(!destdn || destdn->authority().first != whoami);
8020 assert(!straydn || straydn->authority().first != whoami);
8021
8022 bool force_journal_src = false;
8023 bool force_journal_dest = false;
8024 if (in && in->is_dir() && srcdn->authority().first != whoami)
8025 force_journal_src = _need_force_journal(in, false);
8026 if (in && target && target->is_dir())
8027 force_journal_dest = _need_force_journal(in, true);
8028
8029 version_t srcdnpv = 0;
8030 // repair src
8031 if (srcdn) {
8032 if (srcdn->authority().first == whoami)
8033 srcdnpv = srcdn->pre_dirty();
8034 if (rollback.orig_src.ino) {
8035 assert(in);
8036 srcdn->push_projected_linkage(in);
8037 } else
8038 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
8039 rollback.orig_src.remote_d_type);
8040 }
8041
8042 inode_t *pi = 0;
8043 if (in) {
8044 if (in->authority().first == whoami) {
8045 pi = in->project_inode();
8046 mut->add_projected_inode(in);
8047 pi->version = in->pre_dirty();
8048 } else
8049 pi = in->get_projected_inode();
8050 if (pi->ctime == rollback.ctime)
8051 pi->ctime = rollback.orig_src.old_ctime;
8052 }
8053
8054 if (srcdn && srcdn->authority().first == whoami) {
8055 nest_info_t blah;
8056 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
8057 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
8058 }
8059
8060 // repair dest
8061 if (destdn) {
8062 if (rollback.orig_dest.ino && target) {
8063 destdn->push_projected_linkage(target);
8064 } else if (rollback.orig_dest.remote_ino) {
8065 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
8066 rollback.orig_dest.remote_d_type);
8067 } else {
8068 // the dentry will be trimmed soon, it's ok to have wrong linkage
8069 if (rollback.orig_dest.ino)
8070 assert(mds->is_resolve());
8071 destdn->push_projected_linkage();
8072 }
8073 }
8074
8075 if (straydn)
8076 straydn->push_projected_linkage();
8077
8078 if (target) {
8079 inode_t *ti = NULL;
8080 if (target->authority().first == whoami) {
8081 ti = target->project_inode();
8082 mut->add_projected_inode(target);
8083 ti->version = target->pre_dirty();
8084 } else
8085 ti = target->get_projected_inode();
8086 if (ti->ctime == rollback.ctime)
8087 ti->ctime = rollback.orig_dest.old_ctime;
8088 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
8089 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
8090 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
8091 else
8092 assert(rollback.orig_dest.remote_ino &&
8093 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
8094 } else
8095 ti->nlink++;
8096 }
8097
8098 if (srcdn)
8099 dout(0) << " srcdn back to " << *srcdn << dendl;
8100 if (in)
8101 dout(0) << " srci back to " << *in << dendl;
8102 if (destdn)
8103 dout(0) << " destdn back to " << *destdn << dendl;
8104 if (target)
8105 dout(0) << " desti back to " << *target << dendl;
8106
8107 // journal it
8108 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
8109 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
8110 mdlog->start_entry(le);
8111
8112 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
8113 le->commit.add_dir_context(srcdir);
8114 if (rollback.orig_src.ino)
8115 le->commit.add_primary_dentry(srcdn, 0, true);
8116 else
8117 le->commit.add_remote_dentry(srcdn, true);
8118 }
8119
8120 if (!rollback.orig_src.ino && // remote linkage
8121 in && in->authority().first == whoami) {
8122 le->commit.add_dir_context(in->get_projected_parent_dir());
8123 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
8124 }
8125
8126 if (force_journal_dest) {
8127 assert(rollback.orig_dest.ino);
8128 le->commit.add_dir_context(destdir);
8129 le->commit.add_primary_dentry(destdn, 0, true);
8130 }
8131
8132 // slave: no need to journal straydn
8133
8134 if (target && target != in && target->authority().first == whoami) {
8135 assert(rollback.orig_dest.remote_ino);
8136 le->commit.add_dir_context(target->get_projected_parent_dir());
8137 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
8138 }
8139
8140 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
8141 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
8142 le->commit.renamed_dirino = in->ino();
8143 if (srcdn->authority().first == whoami) {
8144 list<CDir*> ls;
8145 in->get_dirfrags(ls);
8146 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8147 CDir *dir = *p;
8148 if (!dir->is_auth())
8149 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8150 }
8151 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8152 }
8153 } else if (force_journal_dest) {
8154 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8155 le->commit.renamed_dirino = target->ino();
8156 }
8157
8158 if (target && target->is_dir()) {
8159 assert(destdn);
8160 mdcache->project_subtree_rename(target, straydir, destdir);
8161 }
8162
8163 if (in && in->is_dir()) {
8164 assert(srcdn);
8165 mdcache->project_subtree_rename(in, destdir, srcdir);
8166 }
8167
8168 if (mdr && !mdr->more()->slave_update_journaled) {
8169 assert(le->commit.empty());
8170 mdlog->cancel_entry(le);
8171 mut->ls = NULL;
8172 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8173 } else {
8174 assert(!le->commit.empty());
8175 if (mdr)
8176 mdr->more()->slave_update_journaled = false;
8177 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8178 destdn, straydn, finish_mdr);
8179 submit_mdlog_entry(le, fin, mdr, __func__);
8180 mdlog->flush();
8181 }
8182 }
8183
8184 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
8185 version_t srcdnpv, CDentry *destdn,
8186 CDentry *straydn, bool finish_mdr)
8187 {
8188 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
8189
8190 if (straydn) {
8191 straydn->get_dir()->unlink_inode(straydn);
8192 straydn->pop_projected_linkage();
8193 }
8194 if (destdn) {
8195 destdn->get_dir()->unlink_inode(destdn);
8196 destdn->pop_projected_linkage();
8197 }
8198 if (srcdn) {
8199 srcdn->pop_projected_linkage();
8200 if (srcdn->authority().first == mds->get_nodeid())
8201 srcdn->mark_dirty(srcdnpv, mut->ls);
8202 }
8203
8204 mut->apply();
8205
8206 if (srcdn && srcdn->get_linkage()->is_primary()) {
8207 CInode *in = srcdn->get_linkage()->get_inode();
8208 if (srcdn->authority().first == mds->get_nodeid())
8209 in->state_set(CInode::STATE_AUTH);
8210 // update subtree map?
8211 if (in && in->is_dir()) {
8212 assert(destdn);
8213 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
8214 }
8215 }
8216
8217 if (destdn) {
8218 CInode *oldin = destdn->get_linkage()->get_inode();
8219 // update subtree map?
8220 if (oldin && oldin->is_dir()) {
8221 assert(straydn);
8222 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
8223 }
8224 }
8225
8226 if (mds->is_resolve()) {
8227 CDir *root = NULL;
8228 if (straydn)
8229 root = mdcache->get_subtree_root(straydn->get_dir());
8230 else if (destdn)
8231 root = mdcache->get_subtree_root(destdn->get_dir());
8232 if (root)
8233 mdcache->try_trim_non_auth_subtree(root);
8234 }
8235
8236 if (mdr) {
8237 list<MDSInternalContextBase*> finished;
8238 if (mdr->more()->is_ambiguous_auth) {
8239 if (srcdn->is_auth())
8240 mdr->more()->rename_inode->unfreeze_inode(finished);
8241
8242 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8243 mdr->more()->is_ambiguous_auth = false;
8244 }
8245 mds->queue_waiters(finished);
8246 if (finish_mdr || mdr->aborted)
8247 mdcache->request_finish(mdr);
8248 else
8249 mdr->more()->slave_rolling_back = false;
8250 }
8251
8252 mdcache->finish_rollback(mut->reqid);
8253
8254 mut->cleanup();
8255 }
8256
8257 /* This function DOES put the passed message before returning*/
8258 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8259 {
8260 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8261 << " witnessed by " << ack->get_source()
8262 << " " << *ack << dendl;
8263 mds_rank_t from = mds_rank_t(ack->get_source().num());
8264
8265 // note slave
8266 mdr->more()->slaves.insert(from);
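// if this ack came from srcdn's auth and we hold its frozen authpin,
// srci's authority is ambiguous from now until the inode export completes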
8267 if (mdr->more()->srcdn_auth_mds == from &&
8268 mdr->more()->is_remote_frozen_authpin &&
8269 !mdr->more()->is_ambiguous_auth) {
8270 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
8271 }
8272
8273 // witnessed? or add extra witnesses?
8274 assert(mdr->more()->witnessed.count(from) == 0);
8275 if (ack->is_interrupted()) {
8276 dout(10) << " slave request interrupted, noop" << dendl;
8277 } else if (ack->witnesses.empty()) {
8278 mdr->more()->witnessed.insert(from);
8279 if (!ack->is_not_journaled())
8280 mdr->more()->has_journaled_slaves = true;
8281 } else {
8282 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
8283 mdr->more()->extra_witnesses.swap(ack->witnesses);
8284 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
8285 }
8286
8287 // srci import?
8288 if (ack->inode_export.length()) {
8289 dout(10) << " got srci import" << dendl;
8290 mdr->more()->inode_import.claim(ack->inode_export);
8291 mdr->more()->inode_import_v = ack->inode_export_v;
8292 }
8293
8294 // remove from waiting list
8295 assert(mdr->more()->waiting_on_slave.count(from));
8296 mdr->more()->waiting_on_slave.erase(from);
8297
8298 if (mdr->more()->waiting_on_slave.empty())
8299 dispatch_client_request(mdr); // go again!
8300 else
8301 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
8302 }
8303
8304 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8305 {
8306 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8307 << ack->get_source() << dendl;
8308 assert(mdr->is_slave());
8309 mds_rank_t from = mds_rank_t(ack->get_source().num());
8310
8311 if (mdr->more()->waiting_on_slave.count(from)) {
8312 mdr->more()->waiting_on_slave.erase(from);
8313
8314 if (mdr->more()->waiting_on_slave.empty()) {
8315 if (mdr->slave_request)
8316 dispatch_slave_request(mdr);
8317 } else
8318 dout(10) << " still waiting for rename notify acks from "
8319 << mdr->more()->waiting_on_slave << dendl;
8320 }
8321 }
8322
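// MDS_RANK_NONE acts as a sentinel entry in waiting_on_slave for the
// in-flight session flush, so the same "set drained?" check as above
// gates dispatch of the pending slave request.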
8323 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8324 {
8325 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8326
8327 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8328 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8329
8330 if (mdr->more()->waiting_on_slave.empty()) {
8331 if (mdr->slave_request)
8332 dispatch_slave_request(mdr);
8333 } else
8334 dout(10) << " still waiting for rename notify acks from "
8335 << mdr->more()->waiting_on_slave << dendl;
8336 }
8337 }
8338
8339 // snaps
/* This function takes responsibility for the passed mdr. */
8341 void Server::handle_client_lssnap(MDRequestRef& mdr)
8342 {
8343 MClientRequest *req = mdr->client_request;
8344
8345 // traverse to path
8346 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8347 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8348 respond_to_request(mdr, -ESTALE);
8349 return;
8350 }
8351 if (!diri->is_auth()) {
8352 mdcache->request_forward(mdr, diri->authority().first);
8353 return;
8354 }
8355 if (!diri->is_dir()) {
8356 respond_to_request(mdr, -ENOTDIR);
8357 return;
8358 }
8359 dout(10) << "lssnap on " << *diri << dendl;
8360
8361 // lock snap
8362 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8363 mds->locker->include_snap_rdlocks(rdlocks, diri);
8364 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8365 return;
8366
8367 if (!check_access(mdr, diri, MAY_READ))
8368 return;
8369
8370 SnapRealm *realm = diri->find_snaprealm();
8371 map<snapid_t,SnapInfo*> infomap;
8372 realm->get_snap_info(infomap, diri->get_oldest_snap());
8373
8374 unsigned max_entries = req->head.args.readdir.max_entries;
8375 if (!max_entries)
8376 max_entries = infomap.size();
8377 int max_bytes = req->head.args.readdir.max_bytes;
8378 if (!max_bytes)
8379 // make sure at least one item can be encoded
8380 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
8381
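// lssnap paginates like readdir: path2, when present, names the last
// snap returned in the previous chunk, and listing resumes just past
// its snapid via the upper_bound() below.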
8382 __u64 last_snapid = 0;
8383 string offset_str = req->get_path2();
8384 if (!offset_str.empty())
8385 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
8386
8387 bufferlist dirbl;
8388 encode_empty_dirstat(dirbl);
8389
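// Charge what is already committed against the byte budget: the dirstat
// just encoded plus two bytes for the trailing flags word; the
// sizeof(__u32) entry count is credited back (presumably accounted for
// on the client side).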
8390 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
8391
8392 __u32 num = 0;
8393 bufferlist dnbl;
8394 map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
8395 for (; p != infomap.end() && num < max_entries; ++p) {
8396 dout(10) << p->first << " -> " << *p->second << dendl;
8397
8398 // actual
8399 string snap_name;
8400 if (p->second->ino == diri->ino())
8401 snap_name = p->second->name;
8402 else
8403 snap_name = p->second->get_long_name();
8404
8405 unsigned start_len = dnbl.length();
8406 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
8407 break;
8408
8409 ::encode(snap_name, dnbl);
8410 encode_infinite_lease(dnbl);
8411
8412 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
8413 if (r < 0) {
8414 bufferlist keep;
8415 keep.substr_of(dnbl, 0, start_len);
8416 dnbl.swap(keep);
8417 break;
8418 }
8419 ++num;
8420 }
8421
8422 ::encode(num, dirbl);
8423 __u16 flags = 0;
8424 if (p == infomap.end()) {
8425 flags = CEPH_READDIR_FRAG_END;
8426 if (last_snapid == 0)
8427 flags |= CEPH_READDIR_FRAG_COMPLETE;
8428 }
8429 ::encode(flags, dirbl);
8430 dirbl.claim_append(dnbl);
8431
8432 mdr->reply_extra_bl = dirbl;
8433 mdr->tracei = diri;
8434 respond_to_request(mdr, 0);
8435 }
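/*
 * The extra reply payload assembled above is, in order:
 *
 *   dirstat | num (__u32) | flags (__u16) | num x (name, lease, inodestat)
 *
 * which is essentially the shape of a readdir reply -- fitting, since
 * this is what a client sees when it lists the snapshot directory,
 * e.g. "ls mydir/.snap" (".snap" being the default snapdir name).
 */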
8436
8437
8438 // MKSNAP
8439
8440 struct C_MDS_mksnap_finish : public ServerLogContext {
8441 CInode *diri;
8442 SnapInfo info;
8443 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8444 ServerLogContext(s, r), diri(di), info(i) {}
8445 void finish(int r) override {
8446 server->_mksnap_finish(mdr, diri, info);
8447 }
8448 };
8449
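/*
 * mksnap makes two passes through this handler.  Pass one reserves a
 * (stid, snapid) pair from the snap table via snapclient->prepare_create()
 * and re-queues the request; pass two, with the stid now set, projects
 * the inode and snaprealm, journals an EUpdate carrying the table
 * transaction, and finishes in _mksnap_finish(), which commits the stid
 * back to the snap table.  From a client this is triggered by a mkdir in
 * the snapshot directory, e.g. "mkdir mydir/.snap/mysnap" (illustrative
 * path).
 */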
/* This function takes responsibility for the passed mdr. */
8451 void Server::handle_client_mksnap(MDRequestRef& mdr)
8452 {
8453 if (!mds->mdsmap->allows_snaps()) {
// snapshot creation is disabled by default; allows_snaps() only returns
// true once the administrator has opted in (e.g. with
// "ceph fs set <fsname> allow_new_snaps true")
8455 respond_to_request(mdr, -EPERM);
8456 return;
8457 }
8458
8459 MClientRequest *req = mdr->client_request;
8460 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8461 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8462 respond_to_request(mdr, -ESTALE);
8463 return;
8464 }
8465
8466 if (!diri->is_auth()) { // fw to auth?
8467 mdcache->request_forward(mdr, diri->authority().first);
8468 return;
8469 }
8470
8471 // dir only
8472 if (!diri->is_dir()) {
8473 respond_to_request(mdr, -ENOTDIR);
8474 return;
8475 }
8476 if (diri->is_system() && !diri->is_root()) {
8477 // no snaps in system dirs (root is ok)
8478 respond_to_request(mdr, -EPERM);
8479 return;
8480 }
8481
8482 const string &snapname = req->get_filepath().last_dentry();
8483
if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
    mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid "
         << mdr->client_request->get_caller_uid() << dendl;
8486 respond_to_request(mdr, -EPERM);
8487 return;
8488 }
8489
8490 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
8491
8492 // lock snap
8493 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8494
8495 mds->locker->include_snap_rdlocks(rdlocks, diri);
8496 rdlocks.erase(&diri->snaplock);
8497 xlocks.insert(&diri->snaplock);
8498
8499 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8500 return;
8501
8502 if (!check_access(mdr, diri, MAY_WRITE))
8503 return;
8504
8505 // make sure name is unique
8506 if (diri->snaprealm &&
8507 diri->snaprealm->exists(snapname)) {
8508 respond_to_request(mdr, -EEXIST);
8509 return;
8510 }
8511 if (snapname.length() == 0 ||
8512 snapname[0] == '_') {
8513 respond_to_request(mdr, -EINVAL);
8514 return;
8515 }
8516
8517 // allocate a snapid
8518 if (!mdr->more()->stid) {
8519 // prepare an stid
8520 mds->snapclient->prepare_create(diri->ino(), snapname,
8521 mdr->get_mds_stamp(),
8522 &mdr->more()->stid, &mdr->more()->snapidbl,
8523 new C_MDS_RetryRequest(mdcache, mdr));
8524 return;
8525 }
8526
8527 version_t stid = mdr->more()->stid;
8528 snapid_t snapid;
8529 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8530 ::decode(snapid, p);
8531 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
8532
8533 // journal
8534 SnapInfo info;
8535 info.ino = diri->ino();
8536 info.snapid = snapid;
8537 info.name = snapname;
8538 info.stamp = mdr->get_op_stamp();
8539
8540 inode_t *pi = diri->project_inode();
8541 pi->ctime = info.stamp;
8542 pi->version = diri->pre_dirty();
8543
8544 // project the snaprealm
8545 sr_t *newsnap = diri->project_snaprealm(snapid);
8546 newsnap->snaps[snapid] = info;
8547 newsnap->seq = snapid;
8548 newsnap->last_created = snapid;
8549
8550 // journal the inode changes
8551 mdr->ls = mdlog->get_current_segment();
8552 EUpdate *le = new EUpdate(mdlog, "mksnap");
8553 mdlog->start_entry(le);
8554
8555 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8556 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8557 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8558 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8559
8560 // journal the snaprealm changes
8561 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
8562 mdr, __func__);
8563 mdlog->flush();
8564 }
8565
8566 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
8567 {
8568 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
8569
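// If the directory already had its own snaprealm this is a plain snap
// create; otherwise applying the projected realm just instantiated a
// new realm, and clients under the parent realm must be split into it.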
int op = (diri->snaprealm ? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
8571
8572 diri->pop_and_dirty_projected_inode(mdr->ls);
8573 mdr->apply();
8574
8575 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8576
8577 // create snap
8578 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8579
8580 mdcache->do_realm_invalidate_and_update_notify(diri, op);
8581
8582 // yay
8583 mdr->in[0] = diri;
8584 mdr->snapid = info.snapid;
8585 mdr->tracei = diri;
8586 respond_to_request(mdr, 0);
8587 }
8588
8589
8590 // RMSNAP
8591
8592 struct C_MDS_rmsnap_finish : public ServerLogContext {
8593 CInode *diri;
8594 snapid_t snapid;
8595 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8596 ServerLogContext(s, r), diri(di), snapid(sn) {}
8597 void finish(int r) override {
8598 server->_rmsnap_finish(mdr, diri, snapid);
8599 }
8600 };
8601
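/*
 * rmsnap mirrors mksnap's two-pass structure: prepare_destroy() reserves
 * the table transaction and retries the request; the second pass then
 * journals the realm update and commits in _rmsnap_finish().  The client
 * equivalent is "rmdir mydir/.snap/mysnap" (illustrative path).
 */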
/* This function takes responsibility for the passed mdr. */
8603 void Server::handle_client_rmsnap(MDRequestRef& mdr)
8604 {
8605 MClientRequest *req = mdr->client_request;
8606
8607 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8608 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8609 respond_to_request(mdr, -ESTALE);
8610 return;
8611 }
8612 if (!diri->is_auth()) { // fw to auth?
8613 mdcache->request_forward(mdr, diri->authority().first);
8614 return;
8615 }
8616 if (!diri->is_dir()) {
8617 respond_to_request(mdr, -ENOTDIR);
8618 return;
8619 }
8620
8621 const string &snapname = req->get_filepath().last_dentry();
8622
if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
    mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid "
         << mdr->client_request->get_caller_uid() << dendl;
8625 respond_to_request(mdr, -EPERM);
8626 return;
8627 }
8628
8629 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
8630
8631 // does snap exist?
8632 if (snapname.length() == 0 || snapname[0] == '_') {
8633 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
8634 return;
8635 }
8636 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
8637 respond_to_request(mdr, -ENOENT);
8638 return;
8639 }
8640 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
8641 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
8642
8643 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8644 mds->locker->include_snap_rdlocks(rdlocks, diri);
8645 rdlocks.erase(&diri->snaplock);
8646 xlocks.insert(&diri->snaplock);
8647
8648 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8649 return;
8650
8651 if (!check_access(mdr, diri, MAY_WRITE))
8652 return;
8653
8654 // prepare
8655 if (!mdr->more()->stid) {
8656 mds->snapclient->prepare_destroy(diri->ino(), snapid,
8657 &mdr->more()->stid, &mdr->more()->snapidbl,
8658 new C_MDS_RetryRequest(mdcache, mdr));
8659 return;
8660 }
8661 version_t stid = mdr->more()->stid;
8662 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8663 snapid_t seq;
8664 ::decode(seq, p);
8665 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8666
8667 // journal
8668 inode_t *pi = diri->project_inode();
8669 pi->version = diri->pre_dirty();
8670 pi->ctime = mdr->get_op_stamp();
8671
8672 mdr->ls = mdlog->get_current_segment();
8673 EUpdate *le = new EUpdate(mdlog, "rmsnap");
8674 mdlog->start_entry(le);
8675
8676 // project the snaprealm
8677 sr_t *newnode = diri->project_snaprealm();
8678 newnode->snaps.erase(snapid);
8679 newnode->seq = seq;
8680 newnode->last_destroyed = seq;
8681
8682 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8683 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8684 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8685 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8686
8687 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
8688 mdr, __func__);
8689 mdlog->flush();
8690 }
8691
8692 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8693 {
8694 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
version_t stid = mdr->more()->stid;
8696 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8697 snapid_t seq;
8698 ::decode(seq, p);
8699
8700 diri->pop_and_dirty_projected_inode(mdr->ls);
8701 mdr->apply();
8702
8703 mds->snapclient->commit(stid, mdr->ls);
8704
8705 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8706
8707 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8708
8709 // yay
8710 mdr->in[0] = diri;
8711 respond_to_request(mdr, 0);
8712
8713 // purge snapshot data
8714 if (diri->snaprealm->have_past_parents_open())
8715 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8716 }
8717
8718 struct C_MDS_renamesnap_finish : public ServerLogContext {
8719 CInode *diri;
8720 snapid_t snapid;
8721 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8722 ServerLogContext(s, r), diri(di), snapid(sn) {}
8723 void finish(int r) override {
8724 server->_renamesnap_finish(mdr, diri, snapid);
8725 }
8726 };
8727
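/*
 * renamesnap follows the same two-pass pattern via prepare_update().
 * filepath carries the new name and filepath2 the old one, and both must
 * refer to the same directory inode.  Client equivalent:
 * "mv mydir/.snap/oldname mydir/.snap/newname" (illustrative paths).
 */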
/* This function takes responsibility for the passed mdr. */
8729 void Server::handle_client_renamesnap(MDRequestRef& mdr)
8730 {
8731 MClientRequest *req = mdr->client_request;
8732 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
8733 respond_to_request(mdr, -EINVAL);
8734 return;
8735 }
8736
8737 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8738 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8739 respond_to_request(mdr, -ESTALE);
8740 return;
8741 }
8742
8743 if (!diri->is_auth()) { // fw to auth?
8744 mdcache->request_forward(mdr, diri->authority().first);
8745 return;
8746 }
8747
8748 if (!diri->is_dir()) { // dir only
8749 respond_to_request(mdr, -ENOTDIR);
8750 return;
8751 }
8752
8753 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
8754 mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8755 respond_to_request(mdr, -EPERM);
8756 return;
8757 }
8758
8759 const string &dstname = req->get_filepath().last_dentry();
8760 const string &srcname = req->get_filepath2().last_dentry();
8761 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
8762
8763 if (srcname.length() == 0 || srcname[0] == '_') {
8764 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
8765 return;
8766 }
8767 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
8768 respond_to_request(mdr, -ENOENT);
8769 return;
8770 }
8771 if (dstname.length() == 0 || dstname[0] == '_') {
8772 respond_to_request(mdr, -EINVAL);
8773 return;
8774 }
8775 if (diri->snaprealm->exists(dstname)) {
8776 respond_to_request(mdr, -EEXIST);
8777 return;
8778 }
8779
8780 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
8781 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
8782
8783 // lock snap
8784 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8785
8786 mds->locker->include_snap_rdlocks(rdlocks, diri);
8787 rdlocks.erase(&diri->snaplock);
8788 xlocks.insert(&diri->snaplock);
8789
8790 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8791 return;
8792
8793 if (!check_access(mdr, diri, MAY_WRITE))
8794 return;
8795
8796 // prepare
8797 if (!mdr->more()->stid) {
8798 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
8799 &mdr->more()->stid, &mdr->more()->snapidbl,
8800 new C_MDS_RetryRequest(mdcache, mdr));
8801 return;
8802 }
8803
8804 version_t stid = mdr->more()->stid;
8805 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8806 snapid_t seq;
8807 ::decode(seq, p);
8808 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8809
8810 // journal
8811 inode_t *pi = diri->project_inode();
8812 pi->ctime = mdr->get_op_stamp();
8813 pi->version = diri->pre_dirty();
8814
8815 // project the snaprealm
8816 sr_t *newsnap = diri->project_snaprealm();
8817 assert(newsnap->snaps.count(snapid));
8818 newsnap->snaps[snapid].name = dstname;
8819
8820 // journal the inode changes
8821 mdr->ls = mdlog->get_current_segment();
8822 EUpdate *le = new EUpdate(mdlog, "renamesnap");
8823 mdlog->start_entry(le);
8824
8825 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8826 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8827 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8828 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8829
8830 // journal the snaprealm changes
8831 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
8832 mdr, __func__);
8833 mdlog->flush();
8834 }
8835
8836 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8837 {
8838 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
8839
8840 diri->pop_and_dirty_projected_inode(mdr->ls);
8841 mdr->apply();
8842
8843 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8844
8845 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8846
8847 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
8848
8849 // yay
8850 mdr->in[0] = diri;
8851 mdr->tracei = diri;
8852 mdr->snapid = snapid;
8853 respond_to_request(mdr, 0);
8854 }
8855
8856 /**
* Return true if the server is in state RECONNECT and this
8858 * client has not yet reconnected.
8859 */
8860 bool Server::waiting_for_reconnect(client_t c) const
8861 {
8862 return client_reconnect_gather.count(c) > 0;
8863 }
8864
8865 void Server::dump_reconnect_status(Formatter *f) const
8866 {
8867 f->open_object_section("reconnect_status");
8868 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
8869 f->close_section();
8870 }
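/*
 * With a JSON formatter the section above renders roughly as
 * (client ids hypothetical):
 *
 *   {"reconnect_status":{"client_reconnect_gather":"4181,4205"}}
 */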