]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/Server.cc
update sources to 12.2.2
[ceph.git] / ceph / src / mds / Server.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20
21 #include "MDSRank.h"
22 #include "Server.h"
23 #include "Locker.h"
24 #include "MDCache.h"
25 #include "MDLog.h"
26 #include "Migrator.h"
27 #include "MDBalancer.h"
28 #include "InoTable.h"
29 #include "SnapClient.h"
30 #include "Mutation.h"
31
32 #include "msg/Messenger.h"
33
34 #include "osdc/Objecter.h"
35
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
42
43 #include "messages/MMDSSlaveRequest.h"
44
45 #include "messages/MLock.h"
46
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
52
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
59
60 #include <errno.h>
61
62 #include <list>
63 #include <iostream>
64 using namespace std;
65
66 #include "common/config.h"
67
68 #define dout_context g_ceph_context
69 #define dout_subsys ceph_subsys_mds
70 #undef dout_prefix
71 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
72
73 class ServerContext : public MDSInternalContextBase {
74 protected:
75 Server *server;
76 MDSRank *get_mds() override
77 {
78 return server->mds;
79 }
80
81 public:
82 explicit ServerContext(Server *s) : server(s) {
83 assert(server != NULL);
84 }
85 };
86
87 class ServerLogContext : public MDSLogContextBase {
88 protected:
89 Server *server;
90 MDSRank *get_mds() override
91 {
92 return server->mds;
93 }
94
95 MDRequestRef mdr;
96 void pre_finish(int r) override {
97 if (mdr)
98 mdr->mark_event("journal_committed: ");
99 }
100 public:
101 explicit ServerLogContext(Server *s) : server(s) {
102 assert(server != NULL);
103 }
104 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
105 assert(server != NULL);
106 }
107 };
108
109 void Server::create_logger()
110 {
111 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
112 plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
113 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
114 plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
115 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
116 plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
117 "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
118 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
119 plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
120 plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
121 "Request type lookup hash of inode");
122 plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
123 "Request type lookup inode");
124 plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
125 "Request type lookup parent");
126 plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
127 "Request type lookup name");
128 plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
129 "Request type lookup");
130 plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
131 "Request type lookup snapshot");
132 plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
133 "Request type get attribute");
134 plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
135 "Request type set attribute");
136 plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
137 "Request type set file layout");
138 plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
139 "Request type set directory layout");
140 plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
141 "Request type set extended attribute");
142 plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
143 "Request type remove extended attribute");
144 plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
145 "Request type read directory");
146 plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
147 "Request type set file lock");
148 plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
149 "Request type get file lock");
150 plb.add_u64_counter(l_mdss_req_create, "req_create",
151 "Request type create");
152 plb.add_u64_counter(l_mdss_req_open, "req_open",
153 "Request type open");
154 plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
155 "Request type make node");
156 plb.add_u64_counter(l_mdss_req_link, "req_link",
157 "Request type link");
158 plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
159 "Request type unlink");
160 plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
161 "Request type remove directory");
162 plb.add_u64_counter(l_mdss_req_rename, "req_rename",
163 "Request type rename");
164 plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
165 "Request type make directory");
166 plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
167 "Request type symbolic link");
168 plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
169 "Request type list snapshot");
170 plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
171 "Request type make snapshot");
172 plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
173 "Request type remove snapshot");
174 plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
175 "Request type rename snapshot");
176 logger = plb.create_perf_counters();
177 g_ceph_context->get_perfcounters_collection()->add(logger);
178 }
179
180 Server::Server(MDSRank *m) :
181 mds(m),
182 mdcache(mds->mdcache), mdlog(mds->mdlog),
183 logger(0),
184 is_full(false),
185 reconnect_done(NULL),
186 failed_reconnects(0),
187 reconnect_evicting(false),
188 terminating_sessions(false)
189 {
190 }
191
192
/* This function DOES put the passed message before returning. */
void Server::dispatch(Message *m)
{
  // Reconnect messages are handled regardless of rank state;
  // handle_client_reconnect() does its own state checking.
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(static_cast<MClientReconnect*>(m));
    return;
  }

  // active?
  if (!mds->is_active() &&
      !(mds->is_stopping() && m->get_source().is_mds())) {
    // During (or while entering) reconnect, replayed/completed client
    // requests are queued for the clientreplay stage rather than delayed.
    if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
        (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
      MClientRequest *req = static_cast<MClientRequest*>(m);
      Session *session = get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        req->put();
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
      } else if (req->get_retry_attempt()) {
        // Process completed requests in the clientreplay stage.  A completed
        // request might have created a new file/directory; queuing here
        // guarantees the MDS replies to the client before another request
        // modifies that new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // This request was created before the cap reconnect message; drop
        // any embedded cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
      // handle_slave_request() will wait if necessary
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      // Session open requests need to be handled during replay;
      // close requests need to be delayed.
      if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
           (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
        wait_for_active = false;
      } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
        MClientRequest *req = static_cast<MClientRequest*>(m);
        if (req->is_queued_for_replay()) {
          wait_for_active = false;
        }
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  // Normal (active) dispatch: each handler consumes the message.
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(static_cast<MClientRequest*>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    assert(0 == "server unknown message");
  }
}
276
277
278
279 // ----------------------------------------------------------
280 // SESSION management
281
282 class C_MDS_session_finish : public ServerLogContext {
283 Session *session;
284 uint64_t state_seq;
285 bool open;
286 version_t cmapv;
287 interval_set<inodeno_t> inos;
288 version_t inotablev;
289 Context *fin;
290 public:
291 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
292 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
293 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
294 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
295 void finish(int r) override {
296 assert(r == 0);
297 server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
298 if (fin) {
299 fin->complete(r);
300 }
301 }
302 };
303
304 Session *Server::get_session(Message *m)
305 {
306 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
307 if (session) {
308 dout(20) << "get_session have " << session << " " << session->info.inst
309 << " state " << session->get_state_name() << dendl;
310 session->put(); // not carry ref
311 } else {
312 dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
313 }
314 return session;
315 }
316
/* This function DOES put the passed message before returning. */
void Server::handle_client_session(MClientSession *m)
{
  version_t pv;
  bool blacklisted = false;
  Session *session = get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    m->put();
    return;
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    // Already open (or on the way): nothing to journal, but keep any
    // fresher client metadata.
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing()) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      // set client metadata for session opened by prepare_force_open_sessions
      if (!m->client_meta.empty())
        session->set_client_metadata(m->client_meta);
      m->put();
      return;
    }
    assert(session->is_closed() ||
           session->is_closing());

    // Refuse to open a session for a client the OSDs have blacklisted.
    blacklisted = mds->objecter->with_osdmap(
        [session](const OSDMap &osd_map) -> bool {
          return osd_map.is_blacklisted(session->info.inst.addr);
        });

    if (blacklisted) {
      dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
      m->put();
      return;
    }

    session->set_client_metadata(m->client_meta);
    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
             << session->info.client_metadata.size() << " metadata entries:" << dendl;
    for (map<string, string>::iterator i = session->info.client_metadata.begin();
         i != session->info.client_metadata.end(); ++i) {
      dout(20) << " " << i->first << ": " << i->second << dendl;
    }

    // Special case for the 'root' metadata path; validate that the claimed
    // root is actually within the caps of the session
    if (session->info.client_metadata.count("root")) {
      const auto claimed_root = session->info.client_metadata.at("root");
      // claimed_root has a leading "/" which we strip before passing
      // into caps check
      if (claimed_root.empty() || claimed_root[0] != '/' ||
          !session->auth_caps.path_capable(claimed_root.substr(1))) {
        derr << __func__ << " forbidden path claimed as mount root: "
             << claimed_root << " by " << m->get_source() << dendl;
        // Tell the client we're rejecting their open
        mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
        mds->clog->warn() << "client session with invalid root '" <<
          claimed_root << "' denied (" << session->info.inst << ")";
        session->clear();
        // Drop out; don't record this session in SessionMap or journal it.
        break;
      }
    }

    if (session->is_closed())
      mds->sessionmap.add_session(session);

    // Journal the open (ESession); the session is fully opened, and the
    // client notified, only once the entry is safe (C_MDS_session_finish
    // -> _session_logged).
    pv = mds->sessionmap.mark_projected(session);
    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    mds->sessionmap.touch_session(session);
    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
                              new C_MDS_session_finish(this, session, sseq, true, pv));
    mdlog->flush();
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() ||
        session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        // A renew on a stale session revives it and its caps.
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        m->put();
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        m->put();
        return;
      }
      assert(session->is_open() ||
             session->is_stale() ||
             session->is_opening());
      // A close with an old push seq means messages we pushed were not yet
      // seen by the client; drop and let it retry.
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        m->put();
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seqn error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        m->put();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    mdlog->flush();
    break;

  default:
    ceph_abort();
  }
  m->put();
}
469
470 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
471 {
472 for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
473 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
474 assert(session);
475 if (!session->is_open() ||
476 !session->connection.get() ||
477 !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
478 continue;
479 version_t seq = session->wait_for_flush(gather.new_sub());
480 mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
481 }
482 }
483
484 void Server::finish_flush_session(Session *session, version_t seq)
485 {
486 list<MDSInternalContextBase*> finished;
487 session->finish_flush(seq, finished);
488 mds->queue_waiters(finished);
489 }
490
/**
 * Apply a journaled session state change (ESession) once the entry is safe.
 *
 * @param session    session the entry was journaled for
 * @param state_seq  session state seq captured when the entry was
 *                   submitted; if it no longer matches, the change was
 *                   superseded and this is a no-op
 * @param open       true for an open, false for a close/kill
 * @param pv         projected sessionmap version being applied
 * @param inos       preallocated inos released with a close (may be empty)
 * @param piv        projected inotable version, or 0 if no inos released
 */
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    // The release of this session's preallocated inos was journaled;
    // apply it to the inotable now.
    assert(session->is_closing() || session->is_killing() ||
           session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    assert(session->connection != NULL);
    session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
    if (mdcache->is_readonly())
      session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, session->info.inst.name.num());
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    // If we were waiting on this client during reconnect, stop waiting;
    // finish the gather if it was the last one.
    if (client_reconnect_gather.count(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      client_reconnect_gather.erase(session->info.get_client());

      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->connection != NULL) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->connection->mark_disposable();
      }

      // reset session
      mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->connection != NULL) {
        session->connection->mark_down();
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}
576
577 /**
578 * Inject sessions from some source other than actual connections.
579 *
580 * For example:
581 * - sessions inferred from journal replay
582 * - sessions learned from other MDSs during rejoin
583 * - sessions learned from other MDSs during dir/caps migration
584 * - sessions learned from other MDSs during a cross-MDS rename
585 */
586 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
587 map<client_t,uint64_t>& sseqmap)
588 {
589 version_t pv = mds->sessionmap.get_projected();
590
591 dout(10) << "prepare_force_open_sessions " << pv
592 << " on " << cm.size() << " clients"
593 << dendl;
594 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
595
596 Session *session = mds->sessionmap.get_or_add_session(p->second);
597 pv = mds->sessionmap.mark_projected(session);
598 if (session->is_closed() ||
599 session->is_closing() ||
600 session->is_killing())
601 sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
602 else
603 assert(session->is_open() ||
604 session->is_opening() ||
605 session->is_stale());
606 session->inc_importing();
607 }
608 return pv;
609 }
610
611 void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
612 map<client_t,uint64_t>& sseqmap,
613 bool dec_import)
614 {
615 /*
616 * FIXME: need to carefully consider the race conditions between a
617 * client trying to close a session and an MDS doing an import
618 * trying to force open a session...
619 */
620 dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
621 << " initial v " << mds->sessionmap.get_version() << dendl;
622
623
624 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
625
626 Session *session = mds->sessionmap.get_session(p->second.name);
627 assert(session);
628
629 if (sseqmap.count(p->first)) {
630 uint64_t sseq = sseqmap[p->first];
631 if (session->get_state_seq() != sseq) {
632 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
633 } else {
634 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
635 mds->sessionmap.set_state(session, Session::STATE_OPEN);
636 mds->sessionmap.touch_session(session);
637 mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
638 if (mdcache->is_readonly())
639 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
640 }
641 } else {
642 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
643 assert(session->is_open() || session->is_stale());
644 }
645
646 if (dec_import) {
647 session->dec_importing();
648 }
649
650 mds->sessionmap.mark_dirty(session);
651 }
652
653 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
654 }
655
656 class C_MDS_TerminatedSessions : public ServerContext {
657 void finish(int r) override {
658 server->terminating_sessions = false;
659 }
660 public:
661 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
662 };
663
664 void Server::terminate_sessions()
665 {
666 dout(2) << "terminate_sessions" << dendl;
667
668 terminating_sessions = true;
669
670 // kill them off. clients will retry etc.
671 set<Session*> sessions;
672 mds->sessionmap.get_client_session_set(sessions);
673 for (set<Session*>::const_iterator p = sessions.begin();
674 p != sessions.end();
675 ++p) {
676 Session *session = *p;
677 if (session->is_closing() ||
678 session->is_killing() ||
679 session->is_closed())
680 continue;
681 journal_close_session(session, Session::STATE_CLOSING, NULL);
682 }
683
684 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
685 }
686
687
/**
 * Periodic scan for unresponsive clients: first mark sessions past the
 * session timeout STALE (revoking caps/leases), then evict sessions past
 * the autoclose threshold — unless we've been laggy ourselves, or there
 * is only a single client on a single MDS.
 */
void Server::find_idle_sessions()
{
  dout(10) << "find_idle_sessions.  laggy until " << mds->get_laggy_until() << dendl;

  // timeout/stale
  //  (caps go stale, lease die)
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_session_timeout;
  // Sessions are kept ordered by last activity, so we can stop at the
  // first one that is recent enough.
  while (1) {
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
    if (!session) break;
    dout(20) << "laggiest active session is " << session->info.inst << dendl;
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
    mds->sessionmap.set_state(session, Session::STATE_STALE);
    mds->locker->revoke_stale_caps(session);
    mds->locker->remove_stale_leases(session);
    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
    finish_flush_session(session, session->get_push_seq());
  }

  // autoclose
  cutoff = now;
  cutoff -= g_conf->mds_session_autoclose;

  // don't kick clients if we've been laggy
  if (mds->get_laggy_until() > cutoff) {
    dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
             << ", not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 &&
      mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "not evicting a slow client, because there is only one"
             << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  std::vector<Session *> to_evict;
  const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
    return;
  }
  const auto &stale_sessions = sessions_p->second;
  assert(stale_sessions != nullptr);

  for (const auto &session: *stale_sessions) {
    // Importing sessions are in flux; stop scanning at the first one.
    if (session->is_importing()) {
      dout(10) << "stopping at importing session " << session->info.inst << dendl;
      break;
    }
    assert(session->is_stale());
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    to_evict.push_back(session);
  }

  // Evict in a second pass so we are not mutating the stale-session list
  // while iterating it.
  for (const auto &session: to_evict) {
    utime_t age = now;
    age -= session->last_cap_renew;
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << age << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst << " last "
             << session->last_cap_renew << dendl;

    if (g_conf->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->info.inst.name.num(), false, true,
                        ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}
774
775 /*
776 * XXX bump in the interface here, not using an MDSInternalContextBase here
777 * because all the callers right now happen to use a SaferCond
778 */
779 void Server::kill_session(Session *session, Context *on_safe)
780 {
781 assert(mds->mds_lock.is_locked_by_me());
782
783 if ((session->is_opening() ||
784 session->is_open() ||
785 session->is_stale()) &&
786 !session->is_importing()) {
787 dout(10) << "kill_session " << session << dendl;
788 journal_close_session(session, Session::STATE_KILLING, on_safe);
789 } else {
790 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
791 assert(session->is_closing() ||
792 session->is_closed() ||
793 session->is_killing() ||
794 session->is_importing());
795 if (on_safe) {
796 on_safe->complete(0);
797 }
798 }
799 }
800
801 size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
802 {
803 std::list<Session*> victims;
804 const auto sessions = mds->sessionmap.get_sessions();
805 for (const auto p : sessions) {
806 if (!p.first.is_client()) {
807 // Do not apply OSDMap blacklist to MDS daemons, we find out
808 // about their death via MDSMap.
809 continue;
810 }
811
812 Session *s = p.second;
813 if (blacklist.count(s->info.inst.addr)) {
814 victims.push_back(s);
815 }
816 }
817
818 for (const auto s : victims) {
819 kill_session(s, nullptr);
820 }
821
822 dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
823
824 return victims.size();
825 }
826
/**
 * Begin a journaled session close/kill.
 *
 * Moves the session to `state` (CLOSING or KILLING), projects the release
 * of all its preallocated inos, and submits an ESession close entry; the
 * state change and ino release are applied in _session_logged() once the
 * entry is safe.  Also kills the session's in-flight requests and
 * completes any pending message-flush waiters.
 *
 * @param session  session to close
 * @param state    Session::STATE_CLOSING or Session::STATE_KILLING
 * @param on_safe  optional context completed after the close is journaled
 */
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;  // advance before request_kill unlinks the entry
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}
860
/**
 * Enter the client-reconnect phase: seed client_reconnect_gather with
 * every known client and wait for each to send an MClientReconnect.
 * If there are no sessions at all, finish immediately.
 *
 * @param reconnect_done_  context fired once the gather completes
 */
void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;
  mds->sessionmap.get_client_set(client_reconnect_gather);

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the
  // monitor.

  reconnect_start = ceph_clock_now();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}
878
/* This function DOES put the passed message before returning*/
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = get_session(m);
  assert(session);

  // mdsmap already says RECONNECT but we have not transitioned yet
  // (mdsmap delivery race): requeue until the state change lands.
  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // time elapsed since the reconnect window opened (for logging/denial msgs)
  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  // deny if we left reconnect state, the session is gone, or the FS is RO
  bool deny = false;
  if (!mds->is_reconnect()) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
        << ceph_mds_state_name(mds->get_state())
        << ") from " << m->get_source_inst()
        << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (session->is_closed()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
        << ceph_mds_state_name(mds->get_state())
        << ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    // tell the client to close; message consumed here
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = ceph_clock_now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms: re-attach the client to each realm it reported
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
        dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
        mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
        // parents not loaded yet; defer until rejoin resolves them
        dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      // inode not in cache; record for later
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
               << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps: reinstate or route every capability the client claims to hold
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_export_caps(p->first, from, p->second,
                                  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}
995
996
997
998 void Server::reconnect_gather_finish()
999 {
1000 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1001 assert(reconnect_done);
1002 reconnect_done->complete(0);
1003 reconnect_done = NULL;
1004 }
1005
// Periodic check during the reconnect phase: once the configured timeout
// expires, evict every client that has not reconnected, then finish the
// gather (immediately, or once async blacklist evictions complete).
void Server::reconnect_tick()
{
  // an eviction pass is already in flight; its gather finisher will call
  // reconnect_gather_finish() for us
  if (reconnect_evicting) {
    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  // clients get mds_reconnect_timeout seconds from reconnect_start
  utime_t reconnect_end = reconnect_start;
  reconnect_end += g_conf->mds_reconnect_timeout;
  if (ceph_clock_now() >= reconnect_end &&
      !client_reconnect_gather.empty()) {
    dout(10) << "reconnect timed out" << dendl;

    // If we're doing blacklist evictions, use this to wait for them before
    // proceeding to reconnect_gather_finish
    MDSGatherBuilder gather(g_ceph_context);

    for (set<client_t>::iterator p = client_reconnect_gather.begin();
         p != client_reconnect_gather.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
      assert(session);
      dout(1) << "reconnect gave up on " << session->info.inst << dendl;

      mds->clog->warn() << "evicting unresponsive client " << *session
                        << ", after waiting " << g_conf->mds_reconnect_timeout
                        << " seconds during MDS startup";

      // blacklist eviction is asynchronous (monitor round trip); a plain
      // kill is immediate
      if (g_conf->mds_session_blacklist_on_timeout) {
        std::stringstream ss;
        mds->evict_client(session->info.inst.name.num(), false, true, ss,
                          gather.new_sub());
      } else {
        kill_session(session, NULL);
      }

      failed_reconnects++;
    }
    client_reconnect_gather.clear();

    if (gather.has_subs()) {
      dout(1) << "reconnect will complete once clients are evicted" << dendl;
      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
              [this](int r){reconnect_gather_finish();})));
      gather.activate();
      reconnect_evicting = true;
    } else {
      reconnect_gather_finish();
    }
  }
}
1057
1058 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1059 {
1060 if (!locks.length()) return;
1061 int numlocks;
1062 ceph_filelock lock;
1063 bufferlist::iterator p = locks.begin();
1064 ::decode(numlocks, p);
1065 for (int i = 0; i < numlocks; ++i) {
1066 ::decode(lock, p);
1067 lock.client = client;
1068 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1069 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1070 }
1071 ::decode(numlocks, p);
1072 for (int i = 0; i < numlocks; ++i) {
1073 ::decode(lock, p);
1074 lock.client = client;
1075 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1076 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1077 }
1078 }
1079
1080
/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
void Server::recall_client_state(void)
{
  /* try to recall at least 80% of all caps */
  uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
  uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
  // misconfiguration guard: keep max strictly above min so the clamp below
  // stays well-formed
  if (max_caps_per_client < min_caps_per_client) {
    dout(0) << "max_caps_per_client " << max_caps_per_client
            << " < min_caps_per_client " << min_caps_per_client << dendl;
    max_caps_per_client = min_caps_per_client + 1;
  }

  /* unless this ratio is smaller: */
  /* ratio: determine the amount of caps to recall from each client. Use
   * percentage full over the cache reservation. Cap the ratio at 80% of client
   * caps. */
  double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());

  dout(10) << "recall_client_state " << ratio
           << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
           << dendl;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto &session : sessions) {
    // only open client sessions hold recallable caps
    if (!session->is_open() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << " session " << session->info.inst
             << " caps " << session->caps.size()
             << ", leases " << session->leases.size()
             << dendl;

    // new per-session cap limit: caps*ratio, clamped into [min, max]
    uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
    if (session->caps.size() > newlim) {
      MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
      m->head.max_caps = newlim;
      mds->send_message_client(m, session);
      session->notify_recall_sent(newlim);
    }
  }
}
1128
1129 void Server::force_clients_readonly()
1130 {
1131 dout(10) << "force_clients_readonly" << dendl;
1132 set<Session*> sessions;
1133 mds->sessionmap.get_client_session_set(sessions);
1134 for (set<Session*>::const_iterator p = sessions.begin();
1135 p != sessions.end();
1136 ++p) {
1137 Session *session = *p;
1138 if (!session->info.inst.name.is_client() ||
1139 !(session->is_open() || session->is_stale()))
1140 continue;
1141 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
1142 }
1143 }
1144
1145 /*******
1146 * some generic stuff for finishing off requests
1147 */
// Journal the update event 'le' for request 'mdr' and (when allowed) send
// an early, unsafe reply to the client before the journal write commits.
// 'in'/'dn' are pinned and remembered as the reply's trace targets.
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  // may send an unsafe reply now, before the journal entry is durable
  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    // during client-replay, pipeline the next op; flush once the queue drains
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    // client already has its answer, so rdlocks can go now
    mds->locker->drop_rdlocks(mdr.get());
  else
    // no early reply was sent; flush so the client hears back promptly
    mdlog->flush();
}
1179
1180 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1181 const char *event)
1182 {
1183 if (mdr) {
1184 string event_str("submit entry: ");
1185 event_str += event;
1186 mdr->mark_event_string(event_str);
1187 }
1188 mdlog->submit_entry(le, fin);
1189 }
1190
/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));

    // add here to avoid counting ops multiple times (e.g., locks, loading)
    switch(mdr->client_request->get_op()) {
    case CEPH_MDS_OP_LOOKUPHASH:
      logger->inc(l_mdss_req_lookuphash);
      break;
    case CEPH_MDS_OP_LOOKUPINO:
      logger->inc(l_mdss_req_lookupino);
      break;
    case CEPH_MDS_OP_LOOKUPPARENT:
      logger->inc(l_mdss_req_lookupparent);
      break;
    case CEPH_MDS_OP_LOOKUPNAME:
      logger->inc(l_mdss_req_lookupname);
      break;
    case CEPH_MDS_OP_LOOKUP:
      logger->inc(l_mdss_req_lookup);
      break;
    case CEPH_MDS_OP_LOOKUPSNAP:
      logger->inc(l_mdss_req_lookupsnap);
      break;
    case CEPH_MDS_OP_GETATTR:
      logger->inc(l_mdss_req_getattr);
      break;
    case CEPH_MDS_OP_SETATTR:
      logger->inc(l_mdss_req_setattr);
      break;
    case CEPH_MDS_OP_SETLAYOUT:
      logger->inc(l_mdss_req_setlayout);
      break;
    case CEPH_MDS_OP_SETDIRLAYOUT:
      logger->inc(l_mdss_req_setdirlayout);
      break;
    case CEPH_MDS_OP_SETXATTR:
      logger->inc(l_mdss_req_setxattr);
      break;
    case CEPH_MDS_OP_RMXATTR:
      logger->inc(l_mdss_req_rmxattr);
      break;
    case CEPH_MDS_OP_READDIR:
      logger->inc(l_mdss_req_readdir);
      break;
    case CEPH_MDS_OP_SETFILELOCK:
      logger->inc(l_mdss_req_setfilelock);
      break;
    case CEPH_MDS_OP_GETFILELOCK:
      logger->inc(l_mdss_req_getfilelock);
      break;
    case CEPH_MDS_OP_CREATE:
      logger->inc(l_mdss_req_create);
      // NOTE(review): no break — CREATE falls through and also increments
      // the OPEN counter. Looks intentional (a create implies an open),
      // but confirm; if so, an explicit fallthrough marker would help.
    case CEPH_MDS_OP_OPEN:
      logger->inc(l_mdss_req_open);
      break;
    case CEPH_MDS_OP_MKNOD:
      logger->inc(l_mdss_req_mknod);
      break;
    case CEPH_MDS_OP_LINK:
      logger->inc(l_mdss_req_link);
      break;
    case CEPH_MDS_OP_UNLINK:
      logger->inc(l_mdss_req_unlink);
      break;
    case CEPH_MDS_OP_RMDIR:
      logger->inc(l_mdss_req_rmdir);
      break;
    case CEPH_MDS_OP_RENAME:
      logger->inc(l_mdss_req_rename);
      break;
    case CEPH_MDS_OP_MKDIR:
      logger->inc(l_mdss_req_mkdir);
      break;
    case CEPH_MDS_OP_SYMLINK:
      logger->inc(l_mdss_req_symlink);
      break;
    case CEPH_MDS_OP_LSSNAP:
      logger->inc(l_mdss_req_lssnap);
      break;
    case CEPH_MDS_OP_MKSNAP:
      logger->inc(l_mdss_req_mksnap);
      break;
    case CEPH_MDS_OP_RMSNAP:
      logger->inc(l_mdss_req_rmsnap);
      break;
    case CEPH_MDS_OP_RENAMESNAP:
      logger->inc(l_mdss_req_renamesnap);
      break;
    }
  } else if (mdr->internal_op > -1) {
    // internally-generated request: complete its finisher instead of
    // sending a client reply
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}
1293
// Send an "unsafe" reply (success assumed) to the client before the journal
// entry for this request commits, when configuration and request state allow.
// tracei/tracedn become the reply's metadata trace.
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf->mds_early_reply)
    return;

  // slave updates are journaled on other ranks; cannot expose early
  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  // a newly allocated ino must be durable before the client learns of it
  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  MClientRequest *req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  // mds-originated requests get no client-style reply
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  MClientReply *reply = new MClientReply(req, 0);
  reply->set_unsafe();   // mark reply as not-yet-journaled

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
           << " (" << cpp_strerror(reply->get_result())
           << ") " << *req << dendl;

  if (tracei || tracedn) {
    // the trace re-issues caps/leases on these, so drop any pending releases
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
                   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  req->get_connection()->send_message(reply);

  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
1356
1357 /*
1358 * send given reply
1359 * include a trace to tracei
1360 * Clean up mdr
1361 */
1362 void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
1363 {
1364 assert(mdr.get());
1365 MClientRequest *req = mdr->client_request;
1366
1367 dout(7) << "reply_client_request " << reply->get_result()
1368 << " (" << cpp_strerror(reply->get_result())
1369 << ") " << *req << dendl;
1370
1371 mdr->mark_event("replying");
1372
1373 Session *session = mdr->session;
1374
1375 // note successful request in session map?
1376 //
1377 // setfilelock requests are special, they only modify states in MDS memory.
1378 // The states get lost when MDS fails. If Client re-send a completed
1379 // setfilelock request, it means that client did not receive corresponding
1380 // setfilelock reply. So MDS should re-execute the setfilelock request.
1381 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1382 reply->get_result() == 0 && session) {
1383 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1384 session->add_completed_request(mdr->reqid.tid, created);
1385 if (mdr->ls) {
1386 mdr->ls->touched_sessions.insert(session->info.inst.name);
1387 }
1388 }
1389
1390 // give any preallocated inos to the session
1391 apply_allocated_inos(mdr, session);
1392
1393 // get tracei/tracedn from mdr?
1394 snapid_t snapid = mdr->snapid;
1395 CInode *tracei = mdr->tracei;
1396 CDentry *tracedn = mdr->tracedn;
1397
1398 bool is_replay = mdr->client_request->is_replay();
1399 bool did_early_reply = mdr->did_early_reply;
1400 entity_inst_t client_inst = req->get_source_inst();
1401 int dentry_wanted = req->get_dentry_wanted();
1402
1403 if (!did_early_reply && !is_replay) {
1404
1405 mds->logger->inc(l_mds_reply);
1406 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1407 mds->logger->tinc(l_mds_reply_latency, lat);
1408 dout(20) << "lat " << lat << dendl;
1409
1410 if (tracei)
1411 mdr->cap_releases.erase(tracei->vino());
1412 if (tracedn)
1413 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1414 }
1415
1416 // drop non-rdlocks before replying, so that we can issue leases
1417 mdcache->request_drop_non_rdlocks(mdr);
1418
1419 // reply at all?
1420 if (client_inst.name.is_mds() || !session) {
1421 reply->put(); // mds doesn't need a reply
1422 reply = 0;
1423 } else {
1424 // send reply.
1425 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1426 (tracei || tracedn)) {
1427 if (is_replay) {
1428 if (tracei)
1429 mdcache->try_reconnect_cap(tracei, session);
1430 } else {
1431 // include metadata in reply
1432 set_trace_dist(session, reply, tracei, tracedn,
1433 snapid, dentry_wanted,
1434 mdr);
1435 }
1436 }
1437
1438 // We can set the extra bl unconditionally: if it's already been sent in the
1439 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1440 reply->set_extra_bl(mdr->reply_extra_bl);
1441
1442 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
1443 req->get_connection()->send_message(reply);
1444 }
1445
1446 if (req->is_queued_for_replay() &&
1447 (mdr->has_completed || reply->get_result() < 0)) {
1448 if (reply->get_result() < 0) {
1449 int r = reply->get_result();
1450 derr << "reply_client_request: failed to replay " << *req
1451 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
1452 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
1453 }
1454 mds->queue_one_replay();
1455 }
1456
1457 // clean up request
1458 mdcache->request_finish(mdr);
1459
1460 // take a closer look at tracei, if it happens to be a remote link
1461 if (tracei &&
1462 tracedn &&
1463 tracedn->get_projected_linkage()->is_remote()) {
1464 mdcache->eval_remote(tracedn);
1465 }
1466 }
1467
1468
1469 void Server::encode_empty_dirstat(bufferlist& bl)
1470 {
1471 static DirStat empty;
1472 empty.encode(bl);
1473 }
1474
1475 void Server::encode_infinite_lease(bufferlist& bl)
1476 {
1477 LeaseStat e;
1478 e.seq = 0;
1479 e.mask = -1;
1480 e.duration_ms = -1;
1481 ::encode(e, bl);
1482 dout(20) << "encode_infinite_lease " << e << dendl;
1483 }
1484
1485 void Server::encode_null_lease(bufferlist& bl)
1486 {
1487 LeaseStat e;
1488 e.seq = 0;
1489 e.mask = 0;
1490 e.duration_ms = 0;
1491 ::encode(e, bl);
1492 dout(20) << "encode_null_lease " << e << dendl;
1493 }
1494
1495
/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 */
void Server::set_trace_dist(Session *session, MClientReply *reply,
                            CInode *in, CDentry *dn,
                            snapid_t snapid,
                            int dentry_wanted,
                            MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups

  // realm: attach the snap trace for live (non-snapshot) lookups
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?  encode parent inode, dir frag, name and lease
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    dir->encode_dirstat(bl, whoami);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    ::encode(dn->get_name(), bl);
    if (snapid == CEPH_NOSNAP)
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else
      encode_null_lease(bl);  // no leases on snapshotted metadata
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode: the trace target itself, last in the buffer
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
1571
1572
1573
1574
/***
 * process a client request
 * This function DOES put the passed message before returning
 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  // the root must be loaded before any path traversal can happen
  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if (session->is_closed() ||
               session->is_closing() ||
               session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      // still consume a replay-queue slot so replay does not stall
      if (req->is_queued_for_replay())
        mds->queue_one_replay();
      req->put();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it? hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?  (replays and retries may duplicate finished work)
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      // Don't send traceless reply if the completed request has created
      // new inode. Treat the request as lookup request instead.
      if (req->is_replay() ||
          ((created == inodeno_t() || !mds->is_clientreplay()) &&
           req->get_op() != CEPH_MDS_OP_OPEN &&
           req->get_op() != CEPH_MDS_OP_CREATE)) {
        dout(5) << "already completed " << req->get_reqid() << dendl;
        MClientReply *reply = new MClientReply(req, 0);
        if (created != inodeno_t()) {
          bufferlist extra;
          ::encode(created, extra);
          reply->set_extra_bl(extra);
        }
        req->get_connection()->send_message(reply);

        if (req->is_queued_for_replay())
          mds->queue_one_replay();

        req->put();
        return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
          req->get_op() != CEPH_MDS_OP_CREATE) {
        // re-run as a lookup/getattr so the client gets a full trace
        dout(10) << " completed request which created new inode " << created
                 << ", convert it to lookup request" << dendl;
        req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
        req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Sessions 'completed_requests' was dirtied, mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
          session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
        session->reset_num_trim_requests_warnings();
    } else {
      // client is not advancing oldest_client_tid; warn with exponential
      // backoff (threshold doubles per warning already issued)
      if (session->get_num_completed_requests() >=
          (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
        session->inc_num_trim_requests_warnings();
        stringstream ss;
        ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
           << req->get_oldest_client_tid() << "), "
           << session->get_num_completed_requests()
           << " completed requests recorded in session\n";
        mds->clog->warn() << ss.str();
        dout(20) << __func__ << " " << ss.str() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  // (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
         p != req->releases.end();
         ++p)
      mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
    req->releases.clear();
  }

  dispatch_client_request(mdr);
  return;
}
1712
1713 void Server::handle_osd_map()
1714 {
1715 /* Note that we check the OSDMAP_FULL flag directly rather than
1716 * using osdmap_full_flag(), because we want to know "is the flag set"
1717 * rather than "does the flag apply to us?" */
1718 mds->objecter->with_osdmap([this](const OSDMap& o) {
1719 is_full = o.test_flag(CEPH_OSDMAP_FULL);
1720 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1721 << o.get_epoch() << dendl;
1722 });
1723 }
1724
1725 void Server::dispatch_client_request(MDRequestRef& mdr)
1726 {
1727 // we shouldn't be waiting on anyone.
1728 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1729
1730 if (mdr->killed) {
1731 dout(10) << "request " << *mdr << " was killed" << dendl;
1732 return;
1733 }
1734
1735 MClientRequest *req = mdr->client_request;
1736
1737 if (logger) logger->inc(l_mdss_dispatch_client_request);
1738
1739 dout(7) << "dispatch_client_request " << *req << dendl;
1740
1741 if (req->may_write()) {
1742 if (mdcache->is_readonly()) {
1743 dout(10) << " read-only FS" << dendl;
1744 respond_to_request(mdr, -EROFS);
1745 return;
1746 }
1747 if (mdr->has_more() && mdr->more()->slave_error) {
1748 dout(10) << " got error from slaves" << dendl;
1749 respond_to_request(mdr, mdr->more()->slave_error);
1750 return;
1751 }
1752 }
1753
1754 if (is_full) {
1755 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1756 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1757 req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1758 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1759 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1760 req->get_op() == CEPH_MDS_OP_CREATE ||
1761 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1762 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1763 ((req->get_op() == CEPH_MDS_OP_LINK ||
1764 req->get_op() == CEPH_MDS_OP_RENAME) &&
1765 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1766 ) {
1767
1768 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1769 respond_to_request(mdr, -ENOSPC);
1770 return;
1771 } else {
1772 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1773 }
1774 }
1775
1776 switch (req->get_op()) {
1777 case CEPH_MDS_OP_LOOKUPHASH:
1778 case CEPH_MDS_OP_LOOKUPINO:
1779 handle_client_lookup_ino(mdr, false, false);
1780 break;
1781 case CEPH_MDS_OP_LOOKUPPARENT:
1782 handle_client_lookup_ino(mdr, true, false);
1783 break;
1784 case CEPH_MDS_OP_LOOKUPNAME:
1785 handle_client_lookup_ino(mdr, false, true);
1786 break;
1787
1788 // inodes ops.
1789 case CEPH_MDS_OP_LOOKUP:
1790 handle_client_getattr(mdr, true);
1791 break;
1792
1793 case CEPH_MDS_OP_LOOKUPSNAP:
1794 // lookupsnap does not reference a CDentry; treat it as a getattr
1795 case CEPH_MDS_OP_GETATTR:
1796 handle_client_getattr(mdr, false);
1797 break;
1798
1799 case CEPH_MDS_OP_SETATTR:
1800 handle_client_setattr(mdr);
1801 break;
1802 case CEPH_MDS_OP_SETLAYOUT:
1803 handle_client_setlayout(mdr);
1804 break;
1805 case CEPH_MDS_OP_SETDIRLAYOUT:
1806 handle_client_setdirlayout(mdr);
1807 break;
1808 case CEPH_MDS_OP_SETXATTR:
1809 handle_client_setxattr(mdr);
1810 break;
1811 case CEPH_MDS_OP_RMXATTR:
1812 handle_client_removexattr(mdr);
1813 break;
1814
1815 case CEPH_MDS_OP_READDIR:
1816 handle_client_readdir(mdr);
1817 break;
1818
1819 case CEPH_MDS_OP_SETFILELOCK:
1820 handle_client_file_setlock(mdr);
1821 break;
1822
1823 case CEPH_MDS_OP_GETFILELOCK:
1824 handle_client_file_readlock(mdr);
1825 break;
1826
1827 // funky.
1828 case CEPH_MDS_OP_CREATE:
1829 if (mdr->has_completed)
1830 handle_client_open(mdr); // already created.. just open
1831 else
1832 handle_client_openc(mdr);
1833 break;
1834
1835 case CEPH_MDS_OP_OPEN:
1836 handle_client_open(mdr);
1837 break;
1838
1839 // namespace.
1840 // no prior locks.
1841 case CEPH_MDS_OP_MKNOD:
1842 handle_client_mknod(mdr);
1843 break;
1844 case CEPH_MDS_OP_LINK:
1845 handle_client_link(mdr);
1846 break;
1847 case CEPH_MDS_OP_UNLINK:
1848 case CEPH_MDS_OP_RMDIR:
1849 handle_client_unlink(mdr);
1850 break;
1851 case CEPH_MDS_OP_RENAME:
1852 handle_client_rename(mdr);
1853 break;
1854 case CEPH_MDS_OP_MKDIR:
1855 handle_client_mkdir(mdr);
1856 break;
1857 case CEPH_MDS_OP_SYMLINK:
1858 handle_client_symlink(mdr);
1859 break;
1860
1861
1862 // snaps
1863 case CEPH_MDS_OP_LSSNAP:
1864 handle_client_lssnap(mdr);
1865 break;
1866 case CEPH_MDS_OP_MKSNAP:
1867 handle_client_mksnap(mdr);
1868 break;
1869 case CEPH_MDS_OP_RMSNAP:
1870 handle_client_rmsnap(mdr);
1871 break;
1872 case CEPH_MDS_OP_RENAMESNAP:
1873 handle_client_renamesnap(mdr);
1874 break;
1875
1876 default:
1877 dout(1) << " unknown client op " << req->get_op() << dendl;
1878 respond_to_request(mdr, -EOPNOTSUPP);
1879 }
1880 }
1881
1882
1883 // ---------------------------------------
1884 // SLAVE REQUESTS
1885
/* This function DOES put the passed message before returning*/
/**
 * Handle an MMDSSlaveRequest from a peer MDS for which we act as slave.
 *
 * Replies (acks) are routed to handle_slave_request_reply().  For new
 * slave work we look up or create the local slave-side MDRequest keyed
 * by the master's reqid, reconcile attempt numbers (the master may have
 * restarted the request with a higher attempt), and finally hand the
 * message to dispatch_slave_request().
 */
void Server::handle_slave_request(MMDSSlaveRequest *m)
{
  dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (logger) logger->inc(l_mdss_handle_slave_request);

  // reply?  (an ack from a slave back to us as master)
  if (m->is_reply())
    return handle_slave_request_reply(m);

  // the purpose of rename notify is enforcing causal message ordering. making sure
  // bystanders have received all messages from rename srcdn's auth MDS.
  if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
    MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
						   MMDSSlaveRequest::OP_RENAMENOTIFYACK);
    mds->send_message(reply, m->get_connection());
    m->put();
    return;
  }

  // replicate the stray dentry (shipped for unlink/rename) into our cache
  CDentry *straydn = NULL;
  if (m->stray.length() > 0) {
    straydn = mdcache->add_replica_stray(m->stray, from);
    assert(straydn);
    m->stray.clear();
  }

  // am i a new slave?
  MDRequestRef mdr;
  if (mdcache->have_request(m->get_reqid())) {
    // existing?
    mdr = mdcache->request_get(m->get_reqid());

    // is my request newer?
    if (mdr->attempt > m->get_attempt()) {
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
	       << ", dropping " << *m << dendl;
      m->put();
      return;
    }


    if (mdr->attempt < m->get_attempt()) {
      // mine is old, close it out
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
	       << ", closing out" << dendl;
      mdcache->request_finish(mdr);
      mdr.reset();
    } else if (mdr->slave_to_mds != from) {
      dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
      m->put();
      return;
    }

    if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
      mdr->aborted = true;
      if (mdr->slave_request) {
	// only abort on-going xlock, wrlock and auth pin
	assert(!mdr->slave_did_prepare());
      } else {
	mdcache->request_finish(mdr);
      }
      // NOTE(review): this path returns without m->put(), unlike the
      // other drop paths above -- looks like a message reference leak;
      // confirm against Message refcounting conventions.
      return;
    }
  }
  if (!mdr.get()) {
    // new?
    if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
      dout(10) << "missing slave request for " << m->get_reqid()
	       << " OP_FINISH, must have lost race with a forward" << dendl;
      m->put();
      return;
    }
    mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
    mdr->set_op_stamp(m->op_stamp);
  }
  assert(mdr->slave_request == 0); // only one at a time, please!

  if (straydn) {
    mdr->pin(straydn);
    mdr->straydn = straydn;
  }

  // defer until this rank can process slave work; the retry context
  // re-delivers m, so do not put it on these paths.
  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
	     mdr->locks.empty()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // hand ownership of the message to the MDRequest; dispatch will put it
  mdr->slave_request = m;

  dispatch_slave_request(mdr);
}
1986
/* This function DOES put the passed message before returning*/
/**
 * Handle a slave's reply to a request for which this MDS is the master.
 *
 * Drops stale replies (wrong attempt, or no matching uncommitted master
 * while we are not yet active), records remotely acquired locks into the
 * master MDRequest's lock sets, and re-dispatches the request once the
 * awaited slave has answered.  Puts @m on every return path except when
 * re-queued via C_MDS_RetryMessage.
 */
void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    // while not yet up, only replies for still-uncommitted master
    // requests matter; anything else is stale and can be dropped
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_master(r, from)) {
      dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
	       << from << " reqid " << r << dendl;
      m->put();
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
    // slave has committed its side; note it so the master can be retired
    metareqid_t r = m->get_reqid();
    mdcache->committed_master_slave(r, from);
    m->put();
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  if (m->get_attempt() != mdr->attempt) {
    dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case MMDSSlaveRequest::OP_XLOCKACK:
    {
      // identify lock, master request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->slaves.insert(from);
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      // mirror the remotely-held xlock in our local lock bookkeeping
      mdr->xlocks.insert(lock);
      mdr->locks.insert(lock);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      assert(mdr->more()->waiting_on_slave.count(from));
      mdr->more()->waiting_on_slave.erase(from);
      assert(mdr->more()->waiting_on_slave.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSSlaveRequest::OP_WRLOCKACK:
    {
      // identify lock, master request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->slaves.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      // wrlock is held remotely: remember which rank holds it for us
      mdr->remote_wrlocks[lock] = from;
      mdr->locks.insert(lock);
      mdr->finish_locking(lock);

      assert(mdr->more()->waiting_on_slave.count(from));
      mdr->more()->waiting_on_slave.erase(from);
      assert(mdr->more()->waiting_on_slave.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSSlaveRequest::OP_AUTHPINACK:
    handle_slave_auth_pin_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_LINKPREPACK:
    handle_slave_link_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREPACK:
    handle_slave_rmdir_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREPACK:
    handle_slave_rename_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
    handle_slave_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort();
  }

  // done with reply.
  m->put();
}
2085
/* This function DOES put the mdr->slave_request before returning*/
/**
 * Execute the slave-side operation carried in mdr->slave_request.
 *
 * Lock ops (xlock/wrlock and their releases, droplocks) are serviced
 * inline here, which also clears and puts mdr->slave_request; the prep
 * ops (authpin, link/unlink, rmdir, rename) are forwarded to dedicated
 * handlers that take over ownership of the slave_request themselves.
 */
void Server::dispatch_slave_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_slave_request);

  int op = mdr->slave_request->get_op();
  switch (op) {
  case MMDSSlaveRequest::OP_XLOCK:
  case MMDSSlaveRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen, if we auth pinned properly.
      }
      if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	set<SimpleLock*> rdlocks;
	set<SimpleLock*> wrlocks = mdr->wrlocks;
	set<SimpleLock*> xlocks = mdr->xlocks;

	int replycode = 0;
	switch (op) {
	case MMDSSlaveRequest::OP_XLOCK:
	  xlocks.insert(lock);
	  replycode = MMDSSlaveRequest::OP_XLOCKACK;
	  break;
	case MMDSSlaveRequest::OP_WRLOCK:
	  wrlocks.insert(lock);
	  replycode = MMDSSlaveRequest::OP_WRLOCKACK;
	  break;
	}

	// may block and retry later; slave_request stays set in that case
	if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
	  return;

	// ack
	MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	mds->send_message(r, mdr->slave_request->get_connection());
      }

      // done.
      mdr->slave_request->put();
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_UNXLOCK:
  case MMDSSlaveRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());
      assert(lock);
      bool need_issue = false;
      switch (op) {
      case MMDSSlaveRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
	break;
      case MMDSSlaveRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
	break;
      }
      if (need_issue)
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done. no ack necessary.
      mdr->slave_request->put();
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_DROPLOCKS:
    mds->locker->drop_locks(mdr.get());
    mdr->slave_request->put();
    mdr->slave_request = 0;
    break;

  case MMDSSlaveRequest::OP_AUTHPIN:
    handle_slave_auth_pin(mdr);
    break;

  case MMDSSlaveRequest::OP_LINKPREP:
  case MMDSSlaveRequest::OP_UNLINKPREP:
    handle_slave_link_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREP:
    handle_slave_rmdir_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREP:
    handle_slave_rename_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_FINISH:
    // information about rename imported caps
    if (mdr->slave_request->inode_export.length() > 0)
      mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
    // finish off request.
    mdcache->request_finish(mdr);
    break;

  default:
    ceph_abort();
  }
}
2208
/* This function DOES put the mdr->slave_request before returning*/
/**
 * Service an OP_AUTHPIN slave request: auth-pin (on behalf of a master
 * MDS) the cache objects listed in the message, then ack back with the
 * set of objects actually pinned.
 *
 * Failure modes: read-only FS (EROFS), object not in cache, object not
 * auth here, or unpinnable-without-blocking when the master asked for
 * nonblocking behavior (EWOULDBLOCK).  On failure we drop any local pins
 * and send an ack flagged with the error; the master reconciles.
 */
void Server::handle_slave_auth_pin(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_auth_pin " << *mdr << dendl;

  // build list of objects
  list<MDSCacheObject*> objects;
  CInode *auth_pin_freeze = NULL;
  bool fail = false, wouldblock = false, readonly = false;

  if (mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;
    readonly = true;
    fail = true;
  }

  if (!fail) {
    // resolve every requested object; missing any one fails the batch
    for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
	 p != mdr->slave_request->get_authpins().end();
	 ++p) {
      MDSCacheObject *object = mdcache->get_object(*p);
      if (!object) {
	dout(10) << " don't have " << *p << dendl;
	fail = true;
	break;
      }

      objects.push_back(object);
      if (*p == mdr->slave_request->get_authpin_freeze())
	auth_pin_freeze = static_cast<CInode*>(object);
    }
  }

  // can we auth pin them?
  if (!fail) {
    for (list<MDSCacheObject*>::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      if (!(*p)->is_auth()) {
	dout(10) << " not auth for " << **p << dendl;
	fail = true;
	break;
      }
      if (mdr->is_auth_pinned(*p))
	continue;
      if (!mdr->can_auth_pin(*p)) {
	if (mdr->slave_request->is_nonblock()) {
	  dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
	  fail = true;
	  wouldblock = true;
	  break;
	}
	// wait
	dout(10) << " waiting for authpinnable on " << **p << dendl;
	(*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
	mdr->drop_local_auth_pins();

	mds->locker->notify_freeze_waiter(*p);
	return;
      }
    }
  }

  // auth pin!
  if (fail) {
    mdr->drop_local_auth_pins();  // just in case
  } else {
    /* freeze authpin wrong inode */
    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
	mdr->more()->rename_inode != auth_pin_freeze)
      mdr->unfreeze_auth_pin(true);

    /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
     * on the source inode to complete. This happens after all locks for the rename
     * operation are acquired. But to acquire locks, we need auth pin locks' parent
     * objects first. So there is an ABBA deadlock if someone auth pins the source inode
     * after locks are acquired and before Server::handle_slave_rename_prep() is called.
     * The solution is freeze the inode and prevent other MDRequests from getting new
     * auth pins.
     */
    if (auth_pin_freeze) {
      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
	auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
	mds->mdlog->flush();
	return;
      }
    }
    for (list<MDSCacheObject*>::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      dout(10) << "auth_pinning " << **p << dendl;
      mdr->auth_pin(*p);
    }
  }

  // ack!
  MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);

  // return list of my auth_pins (if any)
  for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
       p != mdr->auth_pins.end();
       ++p) {
    MDSCacheObjectInfo info;
    (*p)->set_object_info(info);
    reply->get_authpins().push_back(info);
    if (*p == (MDSCacheObject*)auth_pin_freeze)
      auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
  }

  if (wouldblock)
    reply->mark_error_wouldblock();
  if (readonly)
    reply->mark_error_rofs();

  mds->send_message_mds(reply, mdr->slave_to_mds);

  // clean up this request
  mdr->slave_request->put();
  mdr->slave_request = 0;
  return;
}
2331
/* This function DOES NOT put the passed ack before returning*/
/**
 * Process an OP_AUTHPINACK on the master side.
 *
 * Reconciles our record of remote auth pins with what the slave reports:
 * newly pinned objects are recorded under the sender's rank, previously
 * recorded pins absent from the ack are dropped, and any error flag
 * (EROFS / EWOULDBLOCK) aborts the request.  Re-dispatches the request
 * once no more slave acks are outstanding.
 */
void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // added auth pins?
  set<MDSCacheObject*> pinned;
  for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
       p != ack->get_authpins().end();
       ++p) {
    MDSCacheObject *object = mdcache->get_object(*p);
    assert(object);  // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    if (!mdr->is_auth_pinned(object))
      mdr->remote_auth_pins[object] = from;
    if (*p == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
    pinned.insert(object);
  }

  // removed frozen auth pin ?
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
    assert(p != mdr->remote_auth_pins.end());
    if (p->second == from) {
      mdr->more()->is_remote_frozen_authpin = false;
    }
  }

  // removed auth pins?
  map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
  while (p != mdr->remote_auth_pins.end()) {
    MDSCacheObject* object = p->first;
    // only drop pins attributed to this sender that it no longer reports
    if (p->second == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->remote_auth_pins.erase(p++);
    } else {
      ++p;
    }
  }

  if (ack->is_error_rofs()) {
    mdr->more()->slave_error = -EROFS;
    mdr->aborted = true;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->slave_error = -EWOULDBLOCK;
    mdr->aborted = true;
  }

  // note slave
  mdr->more()->slaves.insert(from);

  // clear from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // go again?
  if (mdr->more()->waiting_on_slave.empty())
    mdcache->dispatch_request(mdr);
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
2396
2397
2398 // ---------------------------------------
2399 // HELPERS
2400
2401
2402 /**
2403 * check whether we are permitted to complete a request
2404 *
2405 * Check whether we have permission to perform the operation specified
2406 * by mask on the given inode, based on the capability in the mdr's
2407 * session.
2408 */
2409 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2410 {
2411 if (mdr->session) {
2412 int r = mdr->session->check_access(
2413 in, mask,
2414 mdr->client_request->get_caller_uid(),
2415 mdr->client_request->get_caller_gid(),
2416 &mdr->client_request->get_caller_gid_list(),
2417 mdr->client_request->head.args.setattr.uid,
2418 mdr->client_request->head.args.setattr.gid);
2419 if (r < 0) {
2420 respond_to_request(mdr, r);
2421 return false;
2422 }
2423 }
2424 return true;
2425 }
2426
2427 /**
2428 * check whether fragment has reached maximum size
2429 *
2430 */
2431 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2432 {
2433 const auto size = in->get_frag_size();
2434 if (size >= g_conf->mds_bal_fragment_size_max) {
2435 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2436 respond_to_request(mdr, -ENOSPC);
2437 return false;
2438 }
2439
2440 return true;
2441 }
2442
2443
2444 /** validate_dentry_dir
2445 *
2446 * verify that the dir exists and would own the dname.
2447 * do not check if the dentry exists.
2448 */
2449 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
2450 {
2451 // make sure parent is a dir?
2452 if (!diri->is_dir()) {
2453 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2454 respond_to_request(mdr, -ENOTDIR);
2455 return NULL;
2456 }
2457
2458 // which dirfrag?
2459 frag_t fg = diri->pick_dirfrag(dname);
2460 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2461 if (!dir)
2462 return 0;
2463
2464 // frozen?
2465 if (dir->is_frozen()) {
2466 dout(7) << "dir is frozen " << *dir << dendl;
2467 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2468 return NULL;
2469 }
2470
2471 return dir;
2472 }
2473
2474
/** prepare_null_dentry
 * prepare a null (or existing) dentry in given dir.
 * wait for any dn lock.
 *
 * Returns the dentry, or 0 when the caller must stop: either the request
 * was answered with -EEXIST (existing non-null dentry, !okexist) or the
 * dir is incomplete and a fetch + retry has been queued.
 */
CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
{
  dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
  assert(dir->is_auth());

  client_t client = mdr->get_client();

  // does it already exist?
  CDentry *dn = dir->lookup(dname);
  if (dn) {
    /*
    if (dn->lock.is_xlocked_by_other(mdr)) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    */
    if (!dn->get_linkage(client, mdr)->is_null()) {
      // name already exists
      dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
      if (!okexist) {
	respond_to_request(mdr, -EEXIST);
	return 0;
      }
    } else {
      // reuse the null dentry; bump first past the newest snapshot
      dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
    }

    return dn;
  }

  // make sure dir is complete
  // (the bloom filter lets us skip the fetch when dname provably absent)
  if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
    dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  // create
  dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
  dn->mark_new();
  dout(10) << "prepare_null_dentry added " << *dn << dendl;
  return dn;
}
2523
2524 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2525 {
2526 CDentry *straydn = mdr->straydn;
2527 if (straydn) {
2528 string straydname;
2529 in->name_stray_dentry(straydname);
2530 if (straydn->get_name() == straydname)
2531 return straydn;
2532
2533 assert(!mdr->done_locking);
2534 mdr->unpin(straydn);
2535 }
2536
2537 CDir *straydir = mdcache->get_stray_dir(in);
2538
2539 if (!mdr->client_request->is_replay() &&
2540 !check_fragment_space(mdr, straydir))
2541 return NULL;
2542
2543 straydn = mdcache->get_or_create_stray_dentry(in);
2544 mdr->straydn = straydn;
2545 mdr->pin(straydn);
2546 return straydn;
2547 }
2548
2549 /** prepare_new_inode
2550 *
2551 * create a new inode. set c/m/atime. hit dir pop.
2552 */
2553 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
2554 file_layout_t *layout)
2555 {
2556 CInode *in = new CInode(mdcache);
2557
2558 // Server::prepare_force_open_sessions() can re-open session in closing
2559 // state. In that corner case, session's prealloc_inos are being freed.
2560 // To simplify the code, we disallow using/refilling session's prealloc_ino
2561 // while session is opening.
2562 bool allow_prealloc_inos = !mdr->session->is_opening();
2563
2564 // assign ino
2565 if (allow_prealloc_inos &&
2566 mdr->session->info.prealloc_inos.size()) {
2567 mdr->used_prealloc_ino =
2568 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
2569 mds->sessionmap.mark_projected(mdr->session);
2570
2571 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
2572 << " (" << mdr->session->info.prealloc_inos
2573 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
2574 << dendl;
2575 } else {
2576 mdr->alloc_ino =
2577 in->inode.ino = mds->inotable->project_alloc_id();
2578 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
2579 }
2580
2581 if (useino && useino != in->inode.ino) {
2582 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
2583 mds->clog->error() << mdr->client_request->get_source()
2584 << " specified ino " << useino
2585 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
2586 //ceph_abort(); // just for now.
2587 }
2588
2589 if (allow_prealloc_inos &&
2590 mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
2591 int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
2592 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
2593 assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
2594 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
2595 mds->sessionmap.mark_projected(mdr->session);
2596 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
2597 }
2598
2599 in->inode.version = 1;
2600 in->inode.xattr_version = 1;
2601 in->inode.nlink = 1; // FIXME
2602
2603 in->inode.mode = mode;
2604
2605 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
2606 if (in->inode.is_dir()) {
2607 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
2608 } else if (layout) {
2609 in->inode.layout = *layout;
2610 } else {
2611 in->inode.layout = mdcache->default_file_layout;
2612 }
2613
2614 in->inode.truncate_size = -1ull; // not truncated, yet!
2615 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
2616
2617 CInode *diri = dir->get_inode();
2618
2619 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
2620
2621 if (diri->inode.mode & S_ISGID) {
2622 dout(10) << " dir is sticky" << dendl;
2623 in->inode.gid = diri->inode.gid;
2624 if (S_ISDIR(mode)) {
2625 dout(10) << " new dir also sticky" << dendl;
2626 in->inode.mode |= S_ISGID;
2627 }
2628 } else
2629 in->inode.gid = mdr->client_request->get_caller_gid();
2630
2631 in->inode.uid = mdr->client_request->get_caller_uid();
2632
2633 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
2634 mdr->get_op_stamp();
2635
2636 in->inode.change_attr = 0;
2637
2638 MClientRequest *req = mdr->client_request;
2639 if (req->get_data().length()) {
2640 bufferlist::iterator p = req->get_data().begin();
2641
2642 // xattrs on new inode?
2643 map<string,bufferptr> xattrs;
2644 ::decode(xattrs, p);
2645 for (map<string,bufferptr>::iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
2646 dout(10) << "prepare_new_inode setting xattr " << p->first << dendl;
2647 in->xattrs[p->first] = p->second;
2648 }
2649 }
2650
2651 if (!mds->mdsmap->get_inline_data_enabled() ||
2652 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
2653 in->inode.inline_data.version = CEPH_INLINE_NONE;
2654
2655 mdcache->add_inode(in); // add
2656 dout(10) << "prepare_new_inode " << *in << dendl;
2657 return in;
2658 }
2659
/**
 * Record this request's ino allocations (direct alloc, preallocated
 * batch, and consumed prealloc ino) in the journal event's metablob,
 * together with the projected sessionmap and inotable versions the
 * allocations were made against.
 */
void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
{
  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
	   << " inotablev " << mds->inotable->get_projected_version()
	   << dendl;
  blob->set_ino_alloc(mdr->alloc_ino,
		      mdr->used_prealloc_ino,
		      mdr->prealloc_inos,
		      mdr->client_request->get_source(),
		      mds->sessionmap.get_projected(),
		      mds->inotable->get_projected_version());
}
2672
2673 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2674 {
2675 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2676 << " / " << mdr->prealloc_inos
2677 << " / " << mdr->used_prealloc_ino << dendl;
2678
2679 if (mdr->alloc_ino) {
2680 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2681 }
2682 if (mdr->prealloc_inos.size()) {
2683 assert(session);
2684 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2685 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2686 mds->sessionmap.mark_dirty(session);
2687 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2688 }
2689 if (mdr->used_prealloc_ino) {
2690 assert(session);
2691 session->info.used_inos.erase(mdr->used_prealloc_ino);
2692 mds->sessionmap.mark_dirty(session);
2693 }
2694 }
2695
2696 class C_MDS_TryFindInode : public ServerContext {
2697 MDRequestRef mdr;
2698 public:
2699 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2700 void finish(int r) override {
2701 if (r == -ESTALE) // :( find_ino_peers failed
2702 server->respond_to_request(mdr, r);
2703 else
2704 server->dispatch_client_request(mdr);
2705 }
2706 };
2707
2708 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2709 {
2710 // figure parent dir vs dname
2711 if (refpath.depth() == 0) {
2712 dout(7) << "can't do that to root" << dendl;
2713 respond_to_request(mdr, -EINVAL);
2714 return 0;
2715 }
2716 string dname = refpath.last_dentry();
2717 refpath.pop_dentry();
2718
2719 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2720
2721 // traverse to parent dir
2722 CInode *diri;
2723 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2724 if (r > 0) return 0; // delayed
2725 if (r < 0) {
2726 if (r == -ESTALE) {
2727 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2728 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2729 return 0;
2730 }
2731 respond_to_request(mdr, r);
2732 return 0;
2733 }
2734
2735 // is it an auth dir?
2736 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2737 if (!dir)
2738 return 0; // forwarded or waiting for freeze
2739
2740 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2741 return dir;
2742 }
2743
/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to */
/**
 * Traverse the request's path (filepath or filepath2 per @n), pin the
 * resulting inode, and collect the rdlocks needed on the traversed
 * dentries plus the snap locks of the target.
 *
 * @param mdr          the request; dn[n]/in[n] are filled by traversal
 * @param n            0 -> get_filepath(), 1 -> get_filepath2()
 * @param rdlocks      out: lock set to rdlock
 * @param want_auth    require that we are auth for the target inode
 *                     (forward to auth otherwise)
 * @param no_want_auth suppress the implicit want_auth for snapped dirs
 * @param layout       if non-NULL, also collect layout-aware snap locks
 * @param no_lookup    true if we cannot return a null dentry lease
 * @return the referenced inode, or NULL when delayed/forwarded/answered
 */
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
				    set<SimpleLock*> &rdlocks,
				    bool want_auth,
				    bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
							  a snapped dir */
				    file_layout_t **layout,
				    bool no_lookup)  // true if we cannot return a null dentry lease
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  // locks already acquired on a previous pass: reuse the cached target
  if (mdr->done_locking)
    return mdr->in[n];

  // traverse
  int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
  if (r > 0)
    return NULL;  // delayed
  if (r < 0) {  // error
    if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
      // let the client cache the negative dentry (unless forbidden)
      if (!no_lookup)
	mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
      respond_to_request(mdr, r);
    } else if (r == -ESTALE) {
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
      mdcache->find_ino_peers(refpath.get_ino(), c);
    } else {
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);
    }
    return 0;
  }
  CInode *ref = mdr->in[n];
  dout(10) << "ref is " << *ref << dendl;

  // fw to inode auth?
  if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
    want_auth = true;

  if (want_auth) {
    if (ref->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    if (!ref->is_auth()) {
      dout(10) << "fw to auth for " << *ref << dendl;
      mdcache->request_forward(mdr, ref->authority().first);
      return 0;
    }

    // auth_pin?
    //   do NOT proceed if freezing, as cap release may defer in that case, and
    //   we could deadlock when we try to lock @ref.
    // if we're already auth_pinned, continue; the release has already been processed.
    if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
	(ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      /* If we have any auth pins, this will deadlock.
       * But the only way to get here if we've already got auth pins
       * is because we're on an inode with snapshots that got updated
       * between dispatches of this request. So we're going to drop
       * our locks and our auth pins and reacquire them later.
       *
       * This is safe since we're only in this function when working on
       * a single MDS request; otherwise we'd be in
       * rdlock_path_xlock_dentry.
       */
      mds->locker->drop_locks(mdr.get(), NULL);
      mdr->drop_local_auth_pins();
      if (!mdr->remote_auth_pins.empty())
	mds->locker->notify_freeze_waiter(ref);
      return 0;
    }

    mdr->auth_pin(ref);
  }

  // rdlock every dentry on the path, plus the target's snap locks
  for (int i=0; i<(int)mdr->dn[n].size(); i++)
    rdlocks.insert(&mdr->dn[n][i]->lock);
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
  else
    mds->locker->include_snap_rdlocks(rdlocks, ref);

  // set and pin ref
  mdr->pin(ref);
  return ref;
}
2837
2838
/** rdlock_path_xlock_dentry
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry, forward as necessary.
 * create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 *
 * @param mdr          the request; dn[n]/in[n] are filled in here
 * @param n            0 -> get_filepath(), 1 -> get_filepath2()
 * @param rdlocks/wrlocks/xlocks  out: lock sets for acquire_locks()
 * @param okexist      an existing (non-null) dentry is acceptable
 * @param mustexist    the dentry must already exist (else -ENOENT)
 * @param alwaysxlock  xlock the dentry even when it already exists
 * @param layout       if non-NULL, collect layout-aware snap rdlocks
 * @return the dentry, or 0 when delayed/forwarded/answered
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
					  set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
					  bool okexist, bool mustexist, bool alwaysxlock,
					  file_layout_t **layout)
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();

  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  client_t client = mdr->get_client();

  // locks already acquired on a previous pass: reuse the cached dentry
  if (mdr->done_locking)
    return mdr->dn[n].back();

  CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
  if (!dir) return 0;

  CInode *diri = dir->get_inode();
  if (!mdr->reqid.name.is_mds()) {
    // clients may not modify system dirs (other than under root)
    if (diri->is_system() && !diri->is_root()) {
      respond_to_request(mdr, -EROFS);
      return 0;
    }
  }
  // the parent has already been unlinked (lives under a stray dir)
  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -ENOENT);
    return 0;
  }

  // make a null dentry?
  const string &dname = refpath.last_dentry();
  CDentry *dn;
  if (mustexist) {
    dn = dir->lookup(dname);

    // make sure dir is complete
    if (!dn && !dir->is_complete() &&
	(!dir->has_bloom() || dir->is_in_bloom(dname))) {
      dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // readable?
    if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // exists?
    if (!dn || dn->get_linkage(client, mdr)->is_null()) {
      dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
      respond_to_request(mdr, -ENOENT);
      return 0;
    }
  } else {
    dn = prepare_null_dentry(mdr, dir, dname, okexist);
    if (!dn)
      return 0;
  }

  mdr->dn[n].push_back(dn);
  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  mdr->in[n] = dnl->get_inode();

  // -- lock --
  // NOTE: rename takes the same set of locks for srcdn
  for (int i=0; i<(int)mdr->dn[n].size(); i++)
    rdlocks.insert(&mdr->dn[n][i]->lock);
  if (alwaysxlock || dnl->is_null())
    xlocks.insert(&dn->lock);   // new dn, xlock
  else
    rdlocks.insert(&dn->lock);  // existing dn, rdlock
  wrlocks.insert(&dn->get_dir()->inode->filelock);  // also, wrlock on dir mtime
  wrlocks.insert(&dn->get_dir()->inode->nestlock);  // also, wrlock on dir mtime
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
  else
    mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);

  return dn;
}
2928
2929
2930
2931
2932
2933 /**
2934 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2935 *
2936 * @param diri base inode
2937 * @param fg the exact frag we want
2938 * @param mdr request
2939 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2940 */
2941 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2942 {
2943 CDir *dir = diri->get_dirfrag(fg);
2944
2945 // not open and inode not mine?
2946 if (!dir && !diri->is_auth()) {
2947 mds_rank_t inauth = diri->authority().first;
2948 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2949 mdcache->request_forward(mdr, inauth);
2950 return 0;
2951 }
2952
2953 // not open and inode frozen?
2954 if (!dir && diri->is_frozen()) {
2955 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2956 assert(diri->get_parent_dir());
2957 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2958 return 0;
2959 }
2960
2961 // invent?
2962 if (!dir)
2963 dir = diri->get_or_open_dirfrag(mdcache, fg);
2964
2965 // am i auth for the dirfrag?
2966 if (!dir->is_auth()) {
2967 mds_rank_t auth = dir->authority().first;
2968 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2969 << ", fw to mds." << auth << dendl;
2970 mdcache->request_forward(mdr, auth);
2971 return 0;
2972 }
2973
2974 return dir;
2975 }
2976
2977
2978 // ===============================================================================
2979 // STAT
2980
/* Handle CEPH_MDS_OP_GETATTR / CEPH_MDS_OP_LOOKUP.
 *
 * Pins and rdlocks the target inode as needed, then replies with the
 * inode (and, for lookup, the dentry) in the reply trace.
 *
 * @param mdr the request; replied to / retried on every return path
 * @param is_lookup true for lookup (trace must include the dentry),
 *                  false for plain getattr (empty path means '/')
 */
void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  if (req->get_filepath().depth() == 0 && is_lookup) {
    // refpath can't be empty for lookup but it can for
    // getattr (we do getattr with empty refpath for mount of '/')
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
  if (!ref) return;

  /*
   * if client currently holds the EXCL cap on a field, do not rdlock
   * it; client's stat() will result in valid info if _either_ EXCL
   * cap is held or MDS rdlocks and reads the value here.
   *
   * handling this case here is easier than weakening rdlock
   * semantics... that would cause problems elsewhere.
   */
  client_t client = mdr->get_client();
  int issued = 0;
  Capability *cap = ref->get_client_cap(client);
  if (cap && (mdr->snapid == CEPH_NOSNAP ||
	      mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  // rdlock only the fields the client asked for and doesn't already
  // hold EXCL caps on
  int mask = req->head.args.getattr.mask;
  if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
  if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
  if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, ref, MAY_READ))
    return;

  // note which caps are requested, so we return at least a snapshot
  // value for them. (currently this matters for xattrs and inline data)
  mdr->getattr_caps = mask;

  // bump read popularity for the balancer
  mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
			   req->get_source().num());

  // reply
  dout(10) << "reply to stat on " << *req << dendl;
  mdr->tracei = ref;
  if (is_lookup)
    mdr->tracedn = mdr->dn[0].back();
  respond_to_request(mdr, 0);
}
3037
// Completion for MDCache::open_ino() during lookup-by-ino: resumes the
// request once the inode's location is known (or the lookup failed).
struct C_MDS_LookupIno2 : public ServerContext {
  MDRequestRef mdr;
  C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    // r: authoritative mds rank if >= 0, else a negative errno
    server->_lookup_ino_2(mdr, r);
  }
};
3045
/* This function DOES clean up the mdr before returning*/
/*
 * filepath: ino
 *
 * Look an inode up by ino.  If it isn't cached locally, kick off an
 * open_ino and resume later via C_MDS_LookupIno2.  Depending on the
 * flags, the reply traces the inode itself (lookup_ino), its parent
 * directory inode (want_parent), or its parent dentry (want_dentry).
 */
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
				      bool want_parent, bool want_dentry)
{
  MClientRequest *req = mdr->client_request;

  inodeno_t ino = req->get_filepath().get_ino();
  CInode *in = mdcache->get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    // being deleted; treat as already gone
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!in) {
    // not in cache: locate it (possibly on another mds), then retry
    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
    return;
  }

  // make sure the snaprealm parent chain is open before replying
  if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
      !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
    return;
  }

  // check for nothing (not read or write); this still applies the
  // path check.
  if (!check_access(mdr, in, 0))
    return;

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ? dn->get_dir()->inode : NULL;

  set<SimpleLock*> rdlocks;
  if (dn && (want_parent || want_dentry)) {
    mdr->pin(dn);
    rdlocks.insert(&dn->lock);
  }

  // like getattr: rdlock only the requested fields not already covered
  // by a client EXCL cap
  unsigned mask = req->head.args.getattr.mask;
  if (mask) {
    Capability *cap = in->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      rdlocks.insert(&in->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      rdlocks.insert(&in->xattrlock);

    mdr->getattr_caps = mask;
  }

  if (!rdlocks.empty()) {
    set<SimpleLock*> wrlocks, xlocks;
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    if (diri != NULL) {
      // need read access to directory inode
      if (!check_access(mdr, diri, MAY_READ))
	return;
    }
  }

  if (want_parent) {
    if (in->is_base()) {
      // base inodes (root, mdsdirs) have no parent
      respond_to_request(mdr, -EINVAL);
      return;
    }
    if (!diri || diri->is_stray()) {
      respond_to_request(mdr, -ESTALE);
      return;
    }
    dout(10) << "reply to lookup_parent " << *in << dendl;
    mdr->tracei = diri;
    respond_to_request(mdr, 0);
  } else {
    if (want_dentry) {
      // optional filepath2 names the expected parent directory's ino
      inodeno_t dirino = req->get_filepath2().get_ino();
      if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
	respond_to_request(mdr, -ENOENT);
	return;
      }
      dout(10) << "reply to lookup_name " << *in << dendl;
    } else
      dout(10) << "reply to lookup_ino " << *in << dendl;

    mdr->tracei = in;
    if (want_dentry)
      mdr->tracedn = dn;
    respond_to_request(mdr, 0);
  }
}
3141
3142 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3143 {
3144 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3145 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3146
3147 // `r` is a rank if >=0, else an error code
3148 if (r >= 0) {
3149 mds_rank_t dest_rank(r);
3150 if (dest_rank == mds->get_nodeid())
3151 dispatch_client_request(mdr);
3152 else
3153 mdcache->request_forward(mdr, dest_rank);
3154 return;
3155 }
3156
3157 // give up
3158 if (r == -ENOENT || r == -ENODATA)
3159 r = -ESTALE;
3160 respond_to_request(mdr, r);
3161 }
3162
3163
3164 /* This function takes responsibility for the passed mdr*/
3165 void Server::handle_client_open(MDRequestRef& mdr)
3166 {
3167 MClientRequest *req = mdr->client_request;
3168 dout(7) << "open on " << req->get_filepath() << dendl;
3169
3170 int flags = req->head.args.open.flags;
3171 int cmode = ceph_flags_to_mode(flags);
3172 if (cmode < 0) {
3173 respond_to_request(mdr, -EINVAL);
3174 return;
3175 }
3176
3177 bool need_auth = !file_mode_is_readonly(cmode) ||
3178 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3179
3180 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3181 dout(7) << "read-only FS" << dendl;
3182 respond_to_request(mdr, -EROFS);
3183 return;
3184 }
3185
3186 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3187 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3188 if (!cur)
3189 return;
3190
3191 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3192 assert(!need_auth);
3193 mdr->done_locking = false;
3194 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3195 if (!cur)
3196 return;
3197 }
3198
3199 if (!cur->inode.is_file()) {
3200 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3201 cmode = CEPH_FILE_MODE_PIN;
3202 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3203 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3204 flags &= ~CEPH_O_TRUNC;
3205 }
3206
3207 dout(10) << "open flags = " << flags
3208 << ", filemode = " << cmode
3209 << ", need_auth = " << need_auth
3210 << dendl;
3211
3212 // regular file?
3213 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3214 dout(7) << "not a file or dir " << *cur << dendl;
3215 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3216 return;
3217 }*/
3218 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3219 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3220 respond_to_request(mdr, -EINVAL);
3221 return;
3222 }
3223
3224 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3225 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3226 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3227 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3228 return;
3229 }
3230
3231 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3232 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3233 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3234 respond_to_request(mdr, -EPERM);
3235 return;
3236 }
3237
3238 // snapped data is read only
3239 if (mdr->snapid != CEPH_NOSNAP &&
3240 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3241 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3242 respond_to_request(mdr, -EROFS);
3243 return;
3244 }
3245
3246 unsigned mask = req->head.args.open.mask;
3247 if (mask) {
3248 Capability *cap = cur->get_client_cap(mdr->get_client());
3249 int issued = 0;
3250 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3251 issued = cap->issued();
3252 // permission bits, ACL/security xattrs
3253 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3254 rdlocks.insert(&cur->authlock);
3255 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3256 rdlocks.insert(&cur->xattrlock);
3257
3258 mdr->getattr_caps = mask;
3259 }
3260
3261 // O_TRUNC
3262 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3263 assert(cur->is_auth());
3264
3265 xlocks.insert(&cur->filelock);
3266 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3267 return;
3268
3269 if (!check_access(mdr, cur, MAY_WRITE))
3270 return;
3271
3272 // wait for pending truncate?
3273 const inode_t *pi = cur->get_projected_inode();
3274 if (pi->is_truncating()) {
3275 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3276 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3277 mds->locker->drop_locks(mdr.get());
3278 mdr->drop_local_auth_pins();
3279 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3280 return;
3281 }
3282
3283 do_open_truncate(mdr, cmode);
3284 return;
3285 }
3286
3287 // sync filelock if snapped.
3288 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3289 // and that data itself is flushed so that we can read the snapped data off disk.
3290 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3291 rdlocks.insert(&cur->filelock);
3292 }
3293
3294 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3295 return;
3296
3297 mask = MAY_READ;
3298 if (cmode & CEPH_FILE_MODE_WR)
3299 mask |= MAY_WRITE;
3300 if (!check_access(mdr, cur, mask))
3301 return;
3302
3303 if (cur->is_file() || cur->is_dir()) {
3304 if (mdr->snapid == CEPH_NOSNAP) {
3305 // register new cap
3306 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3307 if (cap)
3308 dout(12) << "open issued caps " << ccap_string(cap->pending())
3309 << " for " << req->get_source()
3310 << " on " << *cur << dendl;
3311 } else {
3312 int caps = ceph_caps_for_mode(cmode);
3313 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3314 << " for " << req->get_source()
3315 << " snapid " << mdr->snapid
3316 << " on " << *cur << dendl;
3317 mdr->snap_caps = caps;
3318 }
3319 }
3320
3321 // increase max_size?
3322 if (cmode & CEPH_FILE_MODE_WR)
3323 mds->locker->check_inode_max_size(cur);
3324
3325 // make sure this inode gets into the journal
3326 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3327 !cur->item_open_file.is_on_list()) {
3328 LogSegment *ls = mds->mdlog->get_current_segment();
3329 EOpen *le = new EOpen(mds->mdlog);
3330 mdlog->start_entry(le);
3331 le->add_clean_inode(cur);
3332 ls->open_files.push_back(&cur->item_open_file);
3333 mdlog->submit_entry(le);
3334 }
3335
3336 // hit pop
3337 if (cmode & CEPH_FILE_MODE_WR)
3338 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
3339 else
3340 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
3341 mdr->client_request->get_source().num());
3342
3343 CDentry *dn = 0;
3344 if (req->get_dentry_wanted()) {
3345 assert(mdr->dn[0].size());
3346 dn = mdr->dn[0].back();
3347 }
3348
3349 mdr->tracei = cur;
3350 mdr->tracedn = dn;
3351 respond_to_request(mdr, 0);
3352 }
3353
// Journal-commit callback for handle_client_openc: links the new inode
// into place, marks it dirty, and sends the reply.
class C_MDS_openc_finish : public ServerLogContext {
  CDentry *dn;       // the (projected) primary dentry for the new file
  CInode *newi;      // the newly created inode
  snapid_t follows;  // snapid the new inode follows
public:
  C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
    ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
  void finish(int r) override {
    assert(r == 0);

    // commit the projected dentry->inode link
    dn->pop_projected_linkage();

    // dirty inode, dn, dir
    newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
    newi->mark_dirty(newi->inode.version+1, mdr->ls);
    newi->_mark_dirty_parent(mdr->ls, true);

    mdr->apply();

    get_mds()->locker->share_inode_max_size(newi);

    // notify replicas of the new link
    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // debugging hook: deliberately crash after reply if configured
    assert(g_conf->mds_kill_openc_at != 1);
  }
};
3385
/* This function takes responsibility for the passed mdr*/
/* Handle open with O_CREAT: create the file if it does not exist,
 * apply any client-requested layout, issue caps immediately, and
 * journal the creation (reply is sent from C_MDS_openc_finish).
 */
void Server::handle_client_openc(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  if (cmode < 0) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  bool excl = req->head.args.open.flags & CEPH_O_EXCL;

  if (!excl) {
    // without O_EXCL an existing file is fine: probe the path first and
    // fall back to a plain open if it resolves
    int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
				   &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
    if (r > 0) return;
    if (r == 0) {
      // it existed.
      handle_client_open(mdr);
      return;
    }
    if (r < 0 && r != -ENOENT) {
      if (r == -ESTALE) {
	dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
	MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
	mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
      } else {
	dout(10) << "FAIL on error " << r << dendl;
	respond_to_request(mdr, r);
      }
      return;
    }
  }

  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
					 !excl, false, false, &dir_layout);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // cannot create inside a snapshot
    respond_to_request(mdr, -EROFS);
    return;
  }
  // set layout
  file_layout_t layout;
  if (dir_layout)
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  // What kind of client caps are required to complete this operation
  uint64_t access = MAY_WRITE;

  // remember the inherited layout so we can tell if the client changed it
  const auto default_layout = layout;

  // fill in any special params from client
  if (req->head.args.open.stripe_unit)
    layout.stripe_unit = req->head.args.open.stripe_unit;
  if (req->head.args.open.stripe_count)
    layout.stripe_count = req->head.args.open.stripe_count;
  if (req->head.args.open.object_size)
    layout.object_size = req->head.args.open.object_size;
  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
      (__s32)req->head.args.open.pool >= 0) {
    layout.pool_id = req->head.args.open.pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // If client doesn't have capability to modify layout pools, then
  // only permit this request if the requested pool matches what the
  // file would have inherited anyway from its parent.
  if (default_layout != layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << " invalid initial file layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // created null dn.
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, access))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  CDentry::linkage_t *dnl = dn->get_projected_linkage();

  if (!dnl->is_null()) {
    // it existed.
    assert(req->head.args.open.flags & CEPH_O_EXCL);
    dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
    mdr->tracei = dnl->get_inode();
    mdr->tracedn = dn;
    respond_to_request(mdr, -EEXIST);
    return;
  }

  // create inode.
  SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
  snapid_t follows = realm->get_newest_seq();

  CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				 req->head.args.open.mode | S_IFREG, &layout);
  assert(in);

  // it's a file.
  dn->push_projected_linkage(in);

  in->inode.version = dn->pre_dirty();
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  in->inode.update_backtrace();
  if (cmode & CEPH_FILE_MODE_WR) {
    // pre-grant a max_size range so the writer can proceed immediately
    in->inode.client_ranges[client].range.first = 0;
    in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
    in->inode.client_ranges[client].follows = follows;
  }
  in->inode.rstat.rfiles = 1;

  assert(dn->first == follows+1);
  in->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, in, true, true, true);

  // do the open
  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
  // brand-new inode: only this client has it, so EXCL is safe
  in->authlock.set_state(LOCK_EXCL);
  in->xattrlock.set_state(LOCK_EXCL);

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(in->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&in->item_open_file);

  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);

  if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
    dout(10) << "adding ino to reply to indicate inode was created" << dendl;
    // add the file created flag onto the reply if create_flags features is supported
    ::encode(in->inode.ino, mdr->reply_extra_bl);
  }

  journal_and_reply(mdr, in, dn, le, fin);

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple opencs in flight), so here is
  // an early chance to split the dir if this openc makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
}
3565
3566
3567
/* Handle CEPH_MDS_OP_READDIR: encode one dirfrag's worth of dentries
 * and inodestats (bounded by max_entries / max_bytes) into the reply,
 * resuming after the client-provided offset.
 */
void Server::handle_client_readdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = req->get_source().num();
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
  if (!diri) return;

  // it's a directory, right?
  if (!diri->is_dir()) {
    // not a dir
    dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  rdlocks.insert(&diri->filelock);
  rdlocks.insert(&diri->dirfragtreelock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  // which frag?
  frag_t fg = (__u32)req->head.args.readdir.frag;
  unsigned req_flags = (__u32)req->head.args.readdir.flags;
  string offset_str = req->get_path2();  // resume-after dentry name, if any

  __u32 offset_hash = 0;
  if (!offset_str.empty())
    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
  else
    offset_hash = (__u32)req->head.args.readdir.offset_hash;

  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
	   << " offset_hash " << offset_hash << " flags " << req_flags << dendl;

  // does the frag exist?
  if (diri->dirfragtree[fg.value()] != fg) {
    // requested frag was split/merged since the client learned it; remap
    frag_t newfg;
    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
      if (fg.contains((unsigned)offset_hash)) {
	newfg = diri->dirfragtree[offset_hash];
      } else {
	// client actually wants next frag
	newfg = diri->dirfragtree[fg.value()];
      }
    } else {
      offset_str.clear();
      newfg = diri->dirfragtree[fg.value()];
    }
    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
    fg = newfg;
  }

  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
  if (!dir) return;

  // ok!
  dout(10) << "handle_client_readdir on " << *dir << dendl;
  assert(dir->is_auth());

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      dout(7) << "dir is frozen " << *dir << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
    // fetch
    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
    return;
  }

#ifdef MDS_VERIFY_FRAGSTAT
  dir->verify_fragstat();
#endif

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  snapid_t snapid = mdr->snapid;
  dout(10) << "snapid " << snapid << dendl;

  SnapRealm *realm = diri->find_snaprealm();

  unsigned max = req->head.args.readdir.max_entries;
  if (!max)
    max = dir->get_num_any(); // whatever, something big.
  unsigned max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;

  // start final blob
  bufferlist dirbl;
  dir->encode_dirstat(dirbl, mds->get_nodeid());

  // count bytes available.
  // this isn't perfect, but we should capture the main variable/unbounded size items!
  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
  int bytes_left = max_bytes - front_bytes;
  bytes_left -= realm->get_snap_trace().length();

  // build dir contents
  bufferlist dnbl;
  __u32 numfiles = 0;
  bool start = !offset_hash && offset_str.empty();
  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
  auto it = start ? dir->begin() : dir->lower_bound(skip_key);
  bool end = (it == dir->end());
  for (; !end && numfiles < max; end = (it == dir->end())) {
    CDentry *dn = it->second;
    ++it;

    if (dn->state_test(CDentry::STATE_PURGING))
      continue;

    bool dnp = dn->use_projected(client, mdr);
    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();

    if (dnl->is_null())
      continue;

    // only include dentries whose [first,last] spans the requested snapid
    if (dn->last < snapid || dn->first > snapid) {
      dout(20) << "skipping non-overlapping snap " << *dn << dendl;
      continue;
    }

    if (!start) {
      // lower_bound may land on the resume dentry itself; skip past it
      dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
      if (!(offset_key < dn->key()))
	continue;
    }

    CInode *in = dnl->get_inode();

    if (in && in->ino() == CEPH_INO_CEPH)
      continue;

    // remote link?
    // better for the MDS to do the work, if we think the client will stat any of these files.
    if (dnl->is_remote() && !in) {
      in = mdcache->get_inode(dnl->get_remote_ino());
      if (in) {
	dn->link_remote(dnl, in);
      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
	dout(10) << "skipping bad remote ino on " << *dn << dendl;
	continue;
      } else {
	// touch everything i _do_ have
	for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
	  if (!p->second->get_linkage()->is_null())
	    mdcache->lru.lru_touch(p->second);

	// already issued caps and leases, reply immediately.
	if (dnbl.length() > 0) {
	  mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
	  dout(10) << " open remote dentry after caps were issued, stopping at "
		   << dnbl.length() << " < " << bytes_left << dendl;
	  break;
	}

	// nothing encoded yet: drop locks, open the remote dentry, retry
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }
    }
    assert(in);

    // conservative size check before encoding the entry
    if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
      dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
      break;
    }

    unsigned start_len = dnbl.length();

    // dentry
    dout(12) << "including dn " << *dn << dendl;
    ::encode(dn->name, dnbl);
    mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);

    // inode
    dout(12) << "including inode " << *in << dendl;
    int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
    if (r < 0) {
      // chop off dn->name, lease
      dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    assert(r >= 0);
    numfiles++;

    // touch dn
    mdcache->lru.lru_touch(dn);
  }

  __u16 flags = 0;
  if (end) {
    flags = CEPH_READDIR_FRAG_END;
    if (start)
      flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
  }
  // client only understand END and COMPLETE flags ?
  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
  }

  // finish final blob
  ::encode(numfiles, dirbl);
  ::encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  // yay, reply
  dout(10) << "reply to " << *req << " readdir num=" << numfiles
	   << " bytes=" << dirbl.length()
	   << " start=" << (int)start
	   << " end=" << (int)end
	   << dendl;
  mdr->reply_extra_bl = dirbl;

  // bump popularity. NOTE: this doesn't quite capture it.
  mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);

  // reply
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
3805
3806
3807
3808 // ===============================================================================
3809 // INODE UPDATES
3810
3811
/*
 * finisher for basic inode updates
 *
 * Journal-commit callback: applies the projected inode, notifies cap
 * holders of a shrinking truncate, and sends the reply.
 */
class C_MDS_inode_update_finish : public ServerLogContext {
  CInode *in;
  bool truncating_smaller, changed_ranges;
public:
  C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
			    bool sm=false, bool cr=false) :
    ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
  void finish(int r) override {
    assert(r == 0);

    // apply
    in->pop_and_dirty_projected_inode(mdr->ls);
    mdr->apply();

    // notify any clients
    if (truncating_smaller && in->inode.is_truncating()) {
      // tell cap holders about the truncate, then kick off the actual
      // object truncation in the cache
      get_mds()->locker->issue_truncate(in);
      get_mds()->mdcache->truncate_inode(in, mdr->ls);
    }

    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // max_size ranges changed: let other cap holders know
    if (changed_ranges)
      get_mds()->locker->share_inode_max_size(in);
  }
};
3843
3844 void Server::handle_client_file_setlock(MDRequestRef& mdr)
3845 {
3846 MClientRequest *req = mdr->client_request;
3847 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3848
3849 // get the inode to operate on, and set up any locks needed for that
3850 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3851 if (!cur)
3852 return;
3853
3854 xlocks.insert(&cur->flocklock);
3855 /* acquire_locks will return true if it gets the locks. If it fails,
3856 it will redeliver this request at a later date, so drop the request.
3857 */
3858 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3859 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3860 return;
3861 }
3862
3863 // copy the lock change into a ceph_filelock so we can store/apply it
3864 ceph_filelock set_lock;
3865 set_lock.start = req->head.args.filelock_change.start;
3866 set_lock.length = req->head.args.filelock_change.length;
3867 set_lock.client = req->get_orig_source().num();
3868 set_lock.owner = req->head.args.filelock_change.owner;
3869 set_lock.pid = req->head.args.filelock_change.pid;
3870 set_lock.type = req->head.args.filelock_change.type;
3871 bool will_wait = req->head.args.filelock_change.wait;
3872
3873 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3874
3875 ceph_lock_state_t *lock_state = NULL;
3876 bool interrupt = false;
3877
3878 // get the appropriate lock state
3879 switch (req->head.args.filelock_change.rule) {
3880 case CEPH_LOCK_FLOCK_INTR:
3881 interrupt = true;
3882 // fall-thru
3883 case CEPH_LOCK_FLOCK:
3884 lock_state = cur->get_flock_lock_state();
3885 break;
3886
3887 case CEPH_LOCK_FCNTL_INTR:
3888 interrupt = true;
3889 // fall-thru
3890 case CEPH_LOCK_FCNTL:
3891 lock_state = cur->get_fcntl_lock_state();
3892 break;
3893
3894 default:
3895 dout(10) << "got unknown lock type " << set_lock.type
3896 << ", dropping request!" << dendl;
3897 respond_to_request(mdr, -EOPNOTSUPP);
3898 return;
3899 }
3900
3901 dout(10) << " state prior to lock change: " << *lock_state << dendl;
3902 if (CEPH_LOCK_UNLOCK == set_lock.type) {
3903 list<ceph_filelock> activated_locks;
3904 list<MDSInternalContextBase*> waiters;
3905 if (lock_state->is_waiting(set_lock)) {
3906 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
3907 lock_state->remove_waiting(set_lock);
3908 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3909 } else if (!interrupt) {
3910 dout(10) << " unlock attempt on " << set_lock << dendl;
3911 lock_state->remove_lock(set_lock, activated_locks);
3912 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3913 }
3914 mds->queue_waiters(waiters);
3915
3916 respond_to_request(mdr, 0);
3917 } else {
3918 dout(10) << " lock attempt on " << set_lock << dendl;
3919 bool deadlock = false;
3920 if (mdr->more()->flock_was_waiting &&
3921 !lock_state->is_waiting(set_lock)) {
3922 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
3923 respond_to_request(mdr, -EINTR);
3924 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
3925 dout(10) << " it failed on this attempt" << dendl;
3926 // couldn't set lock right now
3927 if (deadlock) {
3928 respond_to_request(mdr, -EDEADLK);
3929 } else if (!will_wait) {
3930 respond_to_request(mdr, -EWOULDBLOCK);
3931 } else {
3932 dout(10) << " added to waiting list" << dendl;
3933 assert(lock_state->is_waiting(set_lock));
3934 mdr->more()->flock_was_waiting = true;
3935 mds->locker->drop_locks(mdr.get());
3936 mdr->drop_local_auth_pins();
3937 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
3938 }
3939 } else
3940 respond_to_request(mdr, 0);
3941 }
3942 dout(10) << " state after lock change: " << *lock_state << dendl;
3943 }
3944
3945 void Server::handle_client_file_readlock(MDRequestRef& mdr)
3946 {
3947 MClientRequest *req = mdr->client_request;
3948 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3949
3950 // get the inode to operate on, and set up any locks needed for that
3951 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3952 if (!cur)
3953 return;
3954
3955 /* acquire_locks will return true if it gets the locks. If it fails,
3956 it will redeliver this request at a later date, so drop the request.
3957 */
3958 rdlocks.insert(&cur->flocklock);
3959 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3960 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
3961 return;
3962 }
3963
3964 // copy the lock change into a ceph_filelock so we can store/apply it
3965 ceph_filelock checking_lock;
3966 checking_lock.start = req->head.args.filelock_change.start;
3967 checking_lock.length = req->head.args.filelock_change.length;
3968 checking_lock.client = req->get_orig_source().num();
3969 checking_lock.owner = req->head.args.filelock_change.owner;
3970 checking_lock.pid = req->head.args.filelock_change.pid;
3971 checking_lock.type = req->head.args.filelock_change.type;
3972
3973 // get the appropriate lock state
3974 ceph_lock_state_t *lock_state = NULL;
3975 switch (req->head.args.filelock_change.rule) {
3976 case CEPH_LOCK_FLOCK:
3977 lock_state = cur->get_flock_lock_state();
3978 break;
3979
3980 case CEPH_LOCK_FCNTL:
3981 lock_state = cur->get_fcntl_lock_state();
3982 break;
3983
3984 default:
3985 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
3986 respond_to_request(mdr, -EINVAL);
3987 return;
3988 }
3989 lock_state->look_for_lock(checking_lock);
3990
3991 bufferlist lock_bl;
3992 ::encode(checking_lock, lock_bl);
3993
3994 mdr->reply_extra_bl = lock_bl;
3995 respond_to_request(mdr, 0);
3996 }
3997
/*
 * Handle a client setattr: project the requested attribute changes
 * (mode/uid/gid/times/size/...) onto the inode, journal them as an
 * EUpdate, and reply once durable via C_MDS_inode_update_finish.
 * This function cleans up the passed mdr on every path.
 */
void Server::handle_client_setattr(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur) return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  // refuse setattr on reserved system inodes (other than base inodes)
  if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  __u32 mask = req->head.args.setattr.mask;
  __u32 access_mask = MAY_WRITE;

  // xlock inode: pick which inode locks to take based on what is changing
  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
    xlocks.insert(&cur->authlock);
  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
    xlocks.insert(&cur->filelock);
  if (mask & CEPH_SETATTR_CTIME)
    wrlocks.insert(&cur->versionlock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // ownership changes need extra permission bits
  if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
    access_mask |= MAY_CHOWN;

  if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
    access_mask |= MAY_CHGRP;

  if (!check_access(mdr, cur, access_mask))
    return;

  // trunc from bigger -> smaller?
  inode_t *pi = cur->get_projected_inode();

  // old_size: the larger of our projected size and the size the client saw
  uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);

  // ENOSPC on growing file while full, but allow shrinks
  if (is_full && req->head.args.setattr.size > old_size) {
    dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
    respond_to_request(mdr, -ENOSPC);
    return;
  }

  bool truncating_smaller = false;
  if (mask & CEPH_SETATTR_SIZE) {
    truncating_smaller = req->head.args.setattr.size < old_size;
    if (truncating_smaller && pi->is_truncating()) {
      // only one truncate can be in flight; drop everything and retry
      // when the pending one completes
      dout(10) << " waiting for pending truncate from " << pi->truncate_from
	       << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  bool changed_ranges = false;

  // project update
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setattr");
  mdlog->start_entry(le);

  pi = cur->project_inode();

  if (mask & CEPH_SETATTR_UID)
    pi->uid = req->head.args.setattr.uid;
  if (mask & CEPH_SETATTR_GID)
    pi->gid = req->head.args.setattr.gid;

  if (mask & CEPH_SETATTR_MODE)
    pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
  else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
	    S_ISREG(pi->mode) &&
            (pi->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
    // clear setuid/setgid on chown of an executable regular file
    pi->mode &= ~(S_ISUID|S_ISGID);
  }

  if (mask & CEPH_SETATTR_MTIME)
    pi->mtime = req->head.args.setattr.mtime;
  if (mask & CEPH_SETATTR_ATIME)
    pi->atime = req->head.args.setattr.atime;
  if (mask & CEPH_SETATTR_BTIME)
    pi->btime = req->head.args.setattr.btime;
  if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
    pi->time_warp_seq++;   // maybe not a timewarp, but still a serialization point.
  if (mask & CEPH_SETATTR_SIZE) {
    if (truncating_smaller) {
      // shrinking: record a truncate-start so the finisher can kick off
      // object deletion once the journal entry is durable
      pi->truncate(old_size, req->head.args.setattr.size);
      le->metablob.add_truncate_start(cur->ino());
    } else {
      pi->size = req->head.args.setattr.size;
      pi->rstat.rbytes = pi->size;
    }
    pi->mtime = mdr->get_op_stamp();

    // adjust client's max_size?
    map<client_t,client_writeable_range_t> new_ranges;
    bool max_increased = false;
    mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
    if (pi->client_ranges != new_ranges) {
      dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
      pi->client_ranges = new_ranges;
      changed_ranges = true;
    }
  }

  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;

  // log + wait
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
								   truncating_smaller, changed_ranges));

  // flush immediately if there are readers/writers waiting
  if (xlocks.count(&cur->filelock) &&
      (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
4130
/* Takes responsibility for mdr.
 * Implement the O_TRUNC part of an open: issue caps for the open mode,
 * project a truncate-to-zero on the inode, and journal it as an EUpdate.
 */
void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
{
  CInode *in = mdr->in[0];
  client_t client = mdr->get_client();
  assert(in);

  dout(10) << "do_open_truncate " << *in << dendl;

  SnapRealm *realm = in->find_snaprealm();
  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "open_truncate");
  mdlog->start_entry(le);

  // prepare
  inode_t *pi = in->project_inode();
  pi->version = in->pre_dirty();
  pi->mtime = pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;

  // truncate from the larger of our size and the client's view of it
  uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
  if (old_size > 0) {
    pi->truncate(old_size, 0);
    le->metablob.add_truncate_start(in->ino());
  }

  bool changed_ranges = false;
  if (cmode & CEPH_FILE_MODE_WR) {
    // open-for-write: give the client an initial writeable range
    pi->client_ranges[client].range.first = 0;
    pi->client_ranges[client].range.last = pi->get_layout_size_increment();
    pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
    changed_ranges = true;
  }

  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());

  mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);

  // make sure ino gets into the journal
  le->metablob.add_opened_ino(in->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&in->item_open_file);

  mdr->o_trunc = true;

  CDentry *dn = 0;
  if (mdr->client_request->get_dentry_wanted()) {
    assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();
  }

  journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
								   changed_ranges));
  // Although the `open` part can give an early reply, the truncation won't
  // happen until our EUpdate is persistent, to give the client a prompt
  // response we must also flush that event.
  mdlog->flush();
}
4192
4193
/* This function cleans up the passed mdr.
 * Handle a setlayout request on a regular file: merge the client-supplied
 * layout fields over the current layout, validate, and journal the change.
 * Only allowed on empty, never-truncated files.
 */
void Server::handle_client_setlayout(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur) return;

  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  if (!cur->is_file()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  // layout may only change while the file has no data
  if (cur->get_projected_inode()->size ||
      cur->get_projected_inode()->truncate_seq > 1) {
    respond_to_request(mdr, -ENOTEMPTY);
    return;
  }

  // validate layout
  file_layout_t layout = cur->get_projected_inode()->layout;
  // save existing layout for later
  const auto old_layout = layout;

  int access = MAY_WRITE;

  // only fields the client set (> 0) override the existing layout
  if (req->head.args.setlayout.layout.fl_object_size > 0)
    layout.object_size = req->head.args.setlayout.layout.fl_object_size;
  if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
    layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
  if (req->head.args.setlayout.layout.fl_stripe_count > 0)
    layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
    layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // Don't permit layout modifications without 'p' caps
  if (layout != old_layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << "bad layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  xlocks.insert(&cur->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, cur, access))
    return;

  // project update
  inode_t *pi = cur->project_inode();
  pi->layout = layout;
  // add the old pool to the inode
  pi->add_old_pool(old_layout.pool_id);
  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setlayout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4281
/*
 * Handle a setlayout request on a directory: establish (or modify) the
 * directory's default file layout policy.  The starting layout comes from
 * the directory's own policy, an inherited ancestor policy, or the
 * filesystem default, in that order.  Cleans up the passed mdr.
 */
void Server::handle_client_setdirlayout(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
  if (!cur) return;

  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  if (!cur->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  xlocks.insert(&cur->policylock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // validate layout
  // base layout: own policy > inherited ancestor policy > fs default
  const inode_t *old_pi = cur->get_projected_inode();
  file_layout_t layout;
  if (old_pi->has_layout())
    layout = old_pi->layout;
  else if (dir_layout)
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  // Level of access required to complete
  int access = MAY_WRITE;

  const auto old_layout = layout;

  // only fields the client set (> 0) override the base layout
  if (req->head.args.setlayout.layout.fl_object_size > 0)
    layout.object_size = req->head.args.setlayout.layout.fl_object_size;
  if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
    layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
  if (req->head.args.setlayout.layout.fl_stripe_count > 0)
    layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
    layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // actual layout changes need 'p' caps, not just write access
  if (layout != old_layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << "bad layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  if (!check_access(mdr, cur, access))
    return;

  inode_t *pi = cur->project_inode();
  pi->layout = layout;
  pi->version = cur->pre_dirty();

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setlayout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4366
4367 // XATTRS
4368
4369 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4370 file_layout_t *layout, bool validate)
4371 {
4372 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4373 try {
4374 if (name == "layout") {
4375 string::iterator begin = value.begin();
4376 string::iterator end = value.end();
4377 keys_and_values<string::iterator> p; // create instance of parser
4378 std::map<string, string> m; // map to receive results
4379 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4380 return -EINVAL;
4381 }
4382 string left(begin, end);
4383 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4384 if (begin != end)
4385 return -EINVAL;
4386 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4387 // Skip validation on each attr, we do it once at the end (avoid
4388 // rejecting intermediate states if the overall result is ok)
4389 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4390 osdmap, layout, false);
4391 if (r < 0)
4392 return r;
4393 }
4394 } else if (name == "layout.object_size") {
4395 layout->object_size = boost::lexical_cast<unsigned>(value);
4396 } else if (name == "layout.stripe_unit") {
4397 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4398 } else if (name == "layout.stripe_count") {
4399 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4400 } else if (name == "layout.pool") {
4401 try {
4402 layout->pool_id = boost::lexical_cast<unsigned>(value);
4403 } catch (boost::bad_lexical_cast const&) {
4404 int64_t pool = osdmap.lookup_pg_pool_name(value);
4405 if (pool < 0) {
4406 dout(10) << " unknown pool " << value << dendl;
4407 return -ENOENT;
4408 }
4409 layout->pool_id = pool;
4410 }
4411 } else if (name == "layout.pool_namespace") {
4412 layout->pool_ns = value;
4413 } else {
4414 dout(10) << " unknown layout vxattr " << name << dendl;
4415 return -EINVAL;
4416 }
4417 } catch (boost::bad_lexical_cast const&) {
4418 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4419 return -EINVAL;
4420 }
4421
4422 if (validate && !layout->is_valid()) {
4423 dout(10) << "bad layout" << dendl;
4424 return -EINVAL;
4425 }
4426 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4427 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4428 return -EINVAL;
4429 }
4430 return 0;
4431 }
4432
4433 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4434 {
4435 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4436 try {
4437 if (name == "quota") {
4438 string::iterator begin = value.begin();
4439 string::iterator end = value.end();
4440 keys_and_values<string::iterator> p; // create instance of parser
4441 std::map<string, string> m; // map to receive results
4442 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4443 return -EINVAL;
4444 }
4445 string left(begin, end);
4446 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4447 if (begin != end)
4448 return -EINVAL;
4449 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4450 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4451 if (r < 0)
4452 return r;
4453 }
4454 } else if (name == "quota.max_bytes") {
4455 int64_t q = boost::lexical_cast<int64_t>(value);
4456 if (q < 0)
4457 return -EINVAL;
4458 quota->max_bytes = q;
4459 } else if (name == "quota.max_files") {
4460 int64_t q = boost::lexical_cast<int64_t>(value);
4461 if (q < 0)
4462 return -EINVAL;
4463 quota->max_files = q;
4464 } else {
4465 dout(10) << " unknown quota vxattr " << name << dendl;
4466 return -EINVAL;
4467 }
4468 } catch (boost::bad_lexical_cast const&) {
4469 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4470 return -EINVAL;
4471 }
4472
4473 if (!quota->is_valid()) {
4474 dout(10) << "bad quota" << dendl;
4475 return -EINVAL;
4476 }
4477 return 0;
4478 }
4479
/*
 * Verify that the file layout attribute carried by the client
 * is well-formatted, retrying with a newer OSDMap when the client
 * references a pool we do not know yet.
 * Return 0 on success; on a nonzero return this function has taken
 * responsibility for the passed mdr (it has either replied with an
 * error or queued the request for retry).
 */
int Server::check_layout_vxattr(MDRequestRef& mdr,
                                string name,
                                string value,
                                file_layout_t *layout)
{
  MClientRequest *req = mdr->client_request;
  epoch_t epoch;
  int r;

  mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
      r = parse_layout_vxattr(name, value, osdmap, layout);
      epoch = osdmap.get_epoch();
    });

  if (r == -ENOENT) {

    // we don't have the specified pool, make sure our map
    // is newer than or as new as the client.
    epoch_t req_epoch = req->get_osdmap_epoch();

    if (req_epoch > epoch) {

      // well, our map is older. consult mds.
      Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));

      if (!mds->objecter->wait_for_map(req_epoch, fin))
	return r; // wait, fin will retry this request later

      // wait_for_map returned true: the map is already available and fin
      // will never be called, so dispose of it ourselves
      delete fin;

      // now we have at least as new a map as the client, try again.
      mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
	  r = parse_layout_vxattr(name, value, osdmap, layout);
	  epoch = osdmap.get_epoch();
	});

      assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie

    } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {

      // For compatibility with client w/ old code, we still need get the
      // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
      // we can remove those code.
      mdr->waited_for_osdmap = true;
      mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
			      mds, new C_MDS_RetryRequest(mdcache, mdr)));
      return r;
    }
  }

  if (r < 0) {

    // a still-unknown pool after the retries above is a client error
    if (r == -ENOENT)
      r = -EINVAL;

    respond_to_request(mdr, r);
    return r;
  }

  // all is well
  return 0;
}
4548
/*
 * Apply a "ceph.*" virtual xattr set request: dir/file layouts, quotas,
 * and the export pin.  Each branch validates its value, takes the branch-
 * specific lock, and projects the change; the common tail journals it.
 * Every path either replies, queues a retry, or journals and replies.
 */
void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
			       file_layout_t *dir_layout,
			       set<SimpleLock*> rdlocks,
			       set<SimpleLock*> wrlocks,
			       set<SimpleLock*> xlocks)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());
  bufferlist bl = req->get_data();
  // the xattr value arrives as the request's data payload
  string value (bl.c_str(), bl.length());
  dout(10) << "handle_set_vxattr " << name
	   << " val " << value.length()
	   << " bytes on " << *cur
	   << dendl;

  inode_t *pi = NULL;
  string rest;

  if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
    return;
  }

  if (name.compare(0, 15, "ceph.dir.layout") == 0) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    // start from own policy, else inherited policy, else fs default
    file_layout_t layout;
    if (cur->get_projected_inode()->has_layout())
      layout = cur->get_projected_inode()->layout;
    else if (dir_layout)
      layout = *dir_layout;
    else
      layout = mdcache->default_file_layout;

    // strip the "ceph.dir." prefix: pass "layout..." to the parser
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
      return;

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    pi->layout = layout;
  } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
    if (!cur->is_file()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }
    // file layout may only change while the file has no data
    if (cur->get_projected_inode()->size ||
        cur->get_projected_inode()->truncate_seq > 1) {
      respond_to_request(mdr, -ENOTEMPTY);
      return;
    }
    file_layout_t layout = cur->get_projected_inode()->layout;
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
      return;

    xlocks.insert(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    // remember the previous pool so old objects remain reachable
    int64_t old_pool = pi->layout.pool_id;
    pi->add_old_pool(old_pool);
    pi->layout = layout;
    pi->ctime = mdr->get_op_stamp();
  } else if (name.compare(0, 10, "ceph.quota") == 0) {
    // quotas are per-directory and not allowed on the root
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    quota_info_t quota = cur->get_projected_inode()->quota;

    rest = name.substr(name.find("quota"));
    int r = parse_quota_vxattr(rest, value, &quota);
    if (r < 0) {
      respond_to_request(mdr, r);
      return;
    }

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    pi->quota = quota;
  } else if (name.find("ceph.dir.pin") == 0) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    mds_rank_t rank;
    try {
      rank = boost::lexical_cast<mds_rank_t>(value);
      // any negative value means "unpin"
      if (rank < 0) rank = MDS_RANK_NONE;
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    cur->set_export_pin(rank);
  } else {
    dout(10) << " unknown vxattr " << name << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // common tail: bump bookkeeping fields on the projected inode
  pi->change_attr++;
  pi->ctime = mdr->get_op_stamp();
  pi->version = cur->pre_dirty();
  if (cur->is_file())
    pi->update_backtrace();

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
  return;
}
4685
/*
 * Handle removal of a "ceph.*" virtual xattr.  Only "ceph.dir.layout"
 * can truly be removed; the pool_namespace variants are rewritten as a
 * set-to-empty.  Anything else gets -ENODATA.  Cleans up the passed mdr.
 */
void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
				  file_layout_t *dir_layout,
				  set<SimpleLock*> rdlocks,
				  set<SimpleLock*> wrlocks,
				  set<SimpleLock*> xlocks)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());

  dout(10) << __func__ << " " << name << " on " << *cur << dendl;

  if (name == "ceph.dir.layout") {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -ENODATA);
      return;
    }
    if (cur->is_root()) {
      dout(10) << "can't remove layout policy on the root directory" << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }

    // nothing to remove if no layout policy is set on this directory
    if (!cur->get_projected_inode()->has_layout()) {
      respond_to_request(mdr, -ENODATA);
      return;
    }

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    // project the layout removal and journal it
    inode_t *pi = cur->project_inode();
    pi->clear_layout();
    pi->version = cur->pre_dirty();

    // log + wait
    mdr->ls = mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
    mdlog->start_entry(le);
    le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
    mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

    journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
    return;
  } else if (name == "ceph.dir.layout.pool_namespace"
          || name == "ceph.file.layout.pool_namespace") {
    // Namespace is the only layout field that has a meaningful
    // null/none value (empty string, means default layout).  Is equivalent
    // to a setxattr with empty string: pass through the empty payload of
    // the rmxattr request to do this.
    handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
    return;
  }

  respond_to_request(mdr, -ENODATA);
}
4743
4744 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4745 CInode *in;
4746 public:
4747
4748 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4749 ServerLogContext(s, r), in(i) { }
4750 void finish(int r) override {
4751 assert(r == 0);
4752
4753 // apply
4754 in->pop_and_dirty_projected_inode(mdr->ls);
4755
4756 mdr->apply();
4757
4758 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
4759
4760 server->respond_to_request(mdr, 0);
4761 }
4762 };
4763
/*
 * Handle a client setxattr request on the inode at path(0); the xattr
 * name travels in path2 and the value in the request data payload.
 *
 * "ceph."-prefixed names are virtual xattrs and are dispatched to
 * handle_set_vxattr(); everything else is stored in the inode's xattr
 * map, subject to the mds_max_xattr_pairs_size total-size limit and the
 * CEPH_XATTR_CREATE/REPLACE flag semantics.
 */
void Server::handle_client_setxattr(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *cur;

  // for ceph.dir.layout* vxattrs we also need the nearest ancestor dir
  // layout, so ask rdlock_path_pin_ref to collect it during traversal
  file_layout_t *dir_layout = NULL;
  if (name.compare(0, 15, "ceph.dir.layout") == 0)
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
  else
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur)
    return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  int flags = req->head.args.setxattr.flags;

  // magic ceph.* namespace?
  if (name.compare(0, 5, "ceph.") == 0) {
    handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
    return;
  }

  // plain xattr: take an exclusive lock on the xattr map
  xlocks.insert(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, cur, MAY_WRITE))
    return;

  map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
  size_t len = req->get_data().length();   // size of the new value
  size_t inc = len + name.length();        // bytes this op would add

  // check xattrs kv pairs size
  size_t cur_xattrs_size = 0;
  for (const auto& p : *pxattrs) {
    // a REPLACE of an existing key frees its old bytes; don't count them
    if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
      continue;
    }
    cur_xattrs_size += p.first.length() + p.second.length();
  }

  if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
    dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
             << cur_xattrs_size << ", inc " << inc << dendl;
    respond_to_request(mdr, -ENOSPC);
    return;
  }

  // enforce XATTR_CREATE (key must not exist) / XATTR_REPLACE (must exist)
  if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
    dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
    respond_to_request(mdr, -EEXIST);
    return;
  }
  if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
    dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
    respond_to_request(mdr, -ENODATA);
    return;
  }

  dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;

  // project the update: new xattr map plus ctime/change_attr/xattr_version bumps
  map<string,bufferptr> *px = new map<string,bufferptr>;
  inode_t *pi = cur->project_inode(px);
  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->xattr_version++;
  px->erase(name);
  // CEPH_XATTR_REMOVE means erase-only; otherwise store the new value
  if (!(flags & CEPH_XATTR_REMOVE)) {
    (*px)[name] = buffer::create(len);
    if (len)
      req->get_data().copy(0, len, (*px)[name].c_str());
  }

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setxattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4856
/*
 * Handle a client removexattr request on the inode at path(0); the
 * xattr name travels in path2.
 *
 * "ceph."-prefixed names are virtual xattrs and are dispatched to
 * handle_remove_vxattr(); everything else is erased from the inode's
 * xattr map (ENODATA if absent).
 *
 * NOTE(review): unlike handle_client_setxattr(), this path does not call
 * check_access(..., MAY_WRITE) after acquiring locks -- confirm whether
 * that asymmetry is intentional.
 */
void Server::handle_client_removexattr(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CInode *cur;
  // removing ceph.dir.layout needs the ancestor dir layout for the vxattr path
  if (name == "ceph.dir.layout")
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
  else
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur)
    return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  // magic ceph.* namespace?
  if (name.compare(0, 5, "ceph.") == 0) {
    handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
    return;
  }

  xlocks.insert(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
  if (pxattrs->count(name) == 0) {
    dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
    respond_to_request(mdr, -ENODATA);
    return;
  }

  dout(10) << "removexattr '" << name << "' on " << *cur << dendl;

  // project the update: drop the key, bump ctime/change_attr/xattr_version
  map<string,bufferptr> *px = new map<string,bufferptr>;
  inode_t *pi = cur->project_inode(px);
  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->xattr_version++;
  px->erase(name);

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "removexattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4913
4914
4915 // =================================================================
4916 // DIRECTORY and NAMESPACE OPS
4917
4918
4919 // ------------------------------------------------
4920
4921 // MKNOD
4922
// Journal-completion callback shared by mknod/mkdir/symlink: links the
// newly created inode into the cache, marks it (and, for mkdir, its new
// dirfrag) dirty, notifies replicas, and replies to the client.
class C_MDS_mknod_finish : public ServerLogContext {
  CDentry *dn;    // newly created dentry
  CInode *newi;   // newly created inode to be linked under dn
public:
  C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    assert(r == 0);

    // link the inode
    dn->pop_projected_linkage();

    // be a bit hacky with the inode version, here.. we decrement it
    // just to keep mark_dirty() happy. (we didn't bother projecting
    // a new version of the inode since it's just been created)
    newi->inode.version--;
    newi->mark_dirty(newi->inode.version + 1, mdr->ls);
    newi->_mark_dirty_parent(mdr->ls, true);

    // mkdir?
    if (newi->inode.is_dir()) {
      CDir *dir = newi->get_dirfrag(frag_t());
      assert(dir);
      // same version trick as above, for the brand-new dirfrag
      dir->fnode.version--;
      dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
      dir->mark_new(mdr->ls);
    }

    mdr->apply();

    // tell replicas about the new dentry link
    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    if (newi->inode.is_file())
      get_mds()->locker->share_inode_max_size(newi);

    // hit pop
    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);

    // reply
    server->respond_to_request(mdr, 0);
  }
};
4966
4967
/*
 * Handle a client mknod request: create a new inode at path(0).  If the
 * mode carries no file-type bits it defaults to a regular file, in which
 * case the creating client is also issued exclusive caps and a
 * client_range so it can start writing immediately.
 */
void Server::handle_client_mknod(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  // also fetch the nearest ancestor dir layout, for new regular files
  file_layout_t *dir_layout = NULL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
					 &dir_layout);
  if (!dn) return;
  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CInode *diri = dn->get_dir()->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dn->get_dir()))
    return;

  unsigned mode = req->head.args.mknod.mode;
  if ((mode & S_IFMT) == 0)
    mode |= S_IFREG;

  // set layout: inherit the directory layout for regular files, else default
  file_layout_t layout;
  if (dir_layout && S_ISREG(mode))
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				   mode, &layout);
  assert(newi);

  dn->push_projected_linkage(newi);

  newi->inode.rdev = req->head.args.mknod.rdev;
  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rfiles = 1;
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  newi->inode.update_backtrace();

  // if the client created a _regular_ file via MKNOD, it's highly likely they'll
  // want to write to it (e.g., if they are reexporting NFS)
  if (S_ISREG(newi->inode.mode)) {
    dout(15) << " setting a client_range too, since this is a regular file" << dendl;
    newi->inode.client_ranges[client].range.first = 0;
    newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
    newi->inode.client_ranges[client].follows = follows;

    // issue a cap on the file
    int cmode = CEPH_FILE_MODE_RDWR;
    Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
    if (cap) {
      cap->set_wanted(0);

      // put locks in excl mode
      newi->filelock.set_state(LOCK_EXCL);
      newi->authlock.set_state(LOCK_EXCL);
      newi->xattrlock.set_state(LOCK_EXCL);
    }
  }

  assert(dn->first == follows + 1);
  newi->first = dn->first;

  dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;

  // prepare finisher: journal the new inode + dentry, then reply via
  // C_MDS_mknod_finish
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mknod");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);

  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
				    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5057
5058
5059
5060 // MKDIR
5061 /* This function takes responsibility for the passed mdr*/
/*
 * Handle a client mkdir request: create a directory inode (plus its
 * empty, complete initial dirfrag) at path(0) and issue the creating
 * client exclusive caps on it.
 */
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // mkdir check access
  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  // new inode
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();

  // force the file-type bits to "directory"
  unsigned mode = req->head.args.mkdir.mode;
  mode &= ~S_IFMT;
  mode |= S_IFDIR;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a directory.
  dn->push_projected_linkage(newi);

  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rsubdirs = 1;
  newi->inode.update_backtrace();

  dout(12) << " follows " << follows << dendl;
  assert(dn->first == follows + 1);
  newi->first = dn->first;

  // ...and that new dir is empty.
  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
  newdir->state_set(CDir::STATE_CREATING);
  newdir->mark_complete();
  newdir->fnode.version = newdir->pre_dirty();

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mkdir");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new

  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
  if (cap) {
    cap->set_wanted(0);

    // put locks in excl mode
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
  }

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&newi->item_open_file);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5141
5142
5143 // SYMLINK
5144
/*
 * Handle a client symlink request: create a symlink inode at path(0)
 * whose target string is carried in the request's path2.
 */
void Server::handle_client_symlink(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  unsigned mode = S_IFLNK | 0777;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a symlink
  dn->push_projected_linkage(newi);

  // size/rbytes reflect the length of the link target string
  newi->symlink = req->get_path2();
  newi->inode.size = newi->symlink.length();
  newi->inode.rstat.rbytes = newi->inode.size;
  newi->inode.rstat.rfiles = 1;
  newi->inode.version = dn->pre_dirty();
  newi->inode.update_backtrace();

  newi->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "symlink");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5194
5195
5196
5197
5198
5199 // LINK
5200
/*
 * Handle a client hard-link request: create a remote dentry at path(0)
 * referring to the existing inode at path(1).  Hard links to directories
 * are rejected.  Depending on whether the target inode is locally
 * authoritative, this proceeds via _link_local() or the two-phase
 * _link_remote() protocol.
 */
void Server::handle_client_link(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  dout(7) << "handle_client_link " << req->get_filepath()
	  << " to " << req->get_filepath2()
	  << dendl;

  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
  if (!targeti) return;
  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  CDir *dir = dn->get_dir();
  dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
  dout(7) << "target is " << *targeti << dendl;
  if (targeti->is_dir()) {
    // if srcdn is replica, need to make sure its linkage is correct
    vector<CDentry*>& trace = mdr->dn[1];
    if (trace.empty() ||
	trace.back()->is_auth() ||
	trace.back()->lock.can_read(mdr->get_client())) {
      // linkage is trustworthy here, so reject the dir link outright
      dout(7) << "target is a dir, failing..." << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }
  }

  xlocks.insert(&targeti->linklock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // only run access/space checks on the first pass, i.e. before any
  // slave has witnessed (journaled) the prepare
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, targeti, MAY_WRITE))
      return;

    if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
      return;

    if (!check_fragment_space(mdr, dir))
      return;
  }

  // go!
  assert(g_conf->mds_kill_link_at != 1);

  // local or remote?
  if (targeti->is_auth())
    _link_local(mdr, dn, targeti);
  else
    _link_remote(mdr, true, dn, targeti);
}
5260
5261
5262 class C_MDS_link_local_finish : public ServerLogContext {
5263 CDentry *dn;
5264 CInode *targeti;
5265 version_t dnpv;
5266 version_t tipv;
5267 public:
5268 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5269 version_t dnpv_, version_t tipv_) :
5270 ServerLogContext(s, r), dn(d), targeti(ti),
5271 dnpv(dnpv_), tipv(tipv_) { }
5272 void finish(int r) override {
5273 assert(r == 0);
5274 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5275 }
5276 };
5277
5278
/*
 * Local hard link: the target inode is auth on this MDS, so the whole
 * update (new remote dentry + target nlink++) is a single journaled
 * EUpdate; _link_local_finish() applies it once logged.
 */
void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;

  mdr->ls = mdlog->get_current_segment();

  // predirty NEW dentry
  version_t dnpv = dn->pre_dirty();
  version_t tipv = targeti->pre_dirty();

  // project inode update: bump link count and ctime
  inode_t *pi = targeti->project_inode();
  pi->nlink++;
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->version = tipv;

  // log + wait
  EUpdate *le = new EUpdate(mdlog, "link_local");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);  // new dn
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY);  // targeti
  le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type());  // new remote
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);

  // do this after predirty_*, to avoid funky extra dnl arg
  dn->push_projected_linkage(targeti->ino(), targeti->d_type());

  journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
}
5310
/*
 * Journal-completion half of _link_local(): make the projected dentry
 * and inode updates visible, notify replicas of the new link, bump
 * balancer stats, and reply to the client.
 */
void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
				version_t dnpv, version_t tipv)
{
  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;

  // link and unlock the NEW dentry
  CDentry::linkage_t *dnl = dn->pop_projected_linkage();
  if (!dnl->get_inode())
    dn->link_remote(dnl, targeti);
  dn->mark_dirty(dnpv, mdr->ls);

  // target inode
  targeti->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();

  MDRequestRef null_ref;
  mdcache->send_dentry_link(dn, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);
}
5337
5338
5339 // link / unlink remote
5340
5341 class C_MDS_link_remote_finish : public ServerLogContext {
5342 bool inc;
5343 CDentry *dn;
5344 CInode *targeti;
5345 version_t dpv;
5346 public:
5347 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5348 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5349 dpv(d->get_projected_version()) {}
5350 void finish(int r) override {
5351 assert(r == 0);
5352 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5353 }
5354 };
5355
/*
 * Remote link/unlink: the target inode is auth on another MDS.  Phase 1
 * sends an OP_LINKPREP/OP_UNLINKPREP slave request to the target's auth
 * and returns, waiting for the ack.  Phase 2 (re-entered after the ack,
 * when linkauth appears in witnessed) journals the local dentry change.
 *
 * @param inc  true for link (nlink++), false for unlink (nlink--)
 */
void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_remote "
	   << (inc ? "link ":"unlink ")
	   << *dn << " to " << *targeti << dendl;

  // 1. send LinkPrepare to dest (journal nlink++ prepare)
  mds_rank_t linkauth = targeti->authority().first;
  if (mdr->more()->witnessed.count(linkauth) == 0) {
    // the slave must be usable; otherwise wait for it to recover
    if (mds->is_cluster_degraded() &&
	!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
      dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
      if (mdr->more()->waiting_on_slave.empty())
	mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }

    dout(10) << " targeti auth must prepare nlink++/--" << dendl;
    int op;
    if (inc)
      op = MMDSSlaveRequest::OP_LINKPREP;
    else
      op = MMDSSlaveRequest::OP_UNLINKPREP;
    MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
    targeti->set_object_info(req->get_object_info());
    req->op_stamp = mdr->get_op_stamp();
    mds->send_message_mds(req, linkauth);

    assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
    mdr->more()->waiting_on_slave.insert(linkauth);
    return;
  }
  dout(10) << " targeti auth has prepared nlink++/--" << dendl;

  assert(g_conf->mds_kill_link_at != 2);

  mdr->set_mds_stamp(ceph_clock_now());

  // add to event
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    // remember which slaves journaled a prepare, so we can commit/abort
    // them later (and across MDS restarts)
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_slaves = true;
    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
  }

  if (inc) {
    // link: journal the new remote dentry
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
    le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
    dn->push_projected_linkage(targeti->ino(), targeti->d_type());
  } else {
    // unlink: journal a null dentry in place of the old one
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
    le->metablob.add_null_dentry(dn, true);
    dn->push_projected_linkage();
  }

  journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
}
5421
/*
 * Journal-completion half of _link_remote(): apply the projected dentry
 * change (link or unlink), notify replicas, and reply.  If any slaves
 * witnessed the prepare, also record that the master update is logged.
 */
void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
				 CDentry *dn, CInode *targeti,
				 version_t dpv)
{
  dout(10) << "_link_remote_finish "
	   << (inc ? "link ":"unlink ")
	   << *dn << " to " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 3);

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  if (inc) {
    // link the new dentry
    CDentry::linkage_t *dnl = dn->pop_projected_linkage();
    if (!dnl->get_inode())
      dn->link_remote(dnl, targeti);
    dn->mark_dirty(dpv, mdr->ls);
  } else {
    // unlink main dentry
    dn->get_dir()->unlink_inode(dn);
    dn->pop_projected_linkage();
    dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
  }

  mdr->apply();

  MDRequestRef null_ref;
  if (inc)
    mdcache->send_dentry_link(dn, null_ref);
  else
    mdcache->send_dentry_unlink(dn, NULL, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  if (!inc)
    // removing a new dn?
    dn->get_dir()->try_remove_unlinked_dn(dn);
}
5467
5468
5469 // remote linking/unlinking
5470
5471 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5472 CInode *targeti;
5473 public:
5474 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5475 ServerLogContext(s, r), targeti(t) { }
5476 void finish(int r) override {
5477 assert(r == 0);
5478 server->_logged_slave_link(mdr, targeti);
5479 }
5480 };
5481
5482 class C_MDS_SlaveLinkCommit : public ServerContext {
5483 MDRequestRef mdr;
5484 CInode *targeti;
5485 public:
5486 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5487 ServerContext(s), mdr(r), targeti(t) { }
5488 void finish(int r) override {
5489 server->_commit_slave_link(mdr, r, targeti);
5490 }
5491 };
5492
5493 /* This function DOES put the mdr->slave_request before returning*/
/*
 * Slave side of a remote link/unlink: project and journal the target
 * inode's nlink++/-- (together with the state needed to roll it back),
 * then ack the master with OP_LINKPREPACK once the ESlaveUpdate prepare
 * entry is logged.
 */
void Server::handle_slave_link_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_link_prep " << *mdr
	   << " on " << mdr->slave_request->get_object_info()
	   << dendl;

  assert(g_conf->mds_kill_link_at != 4);

  CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
  assert(targeti);
  dout(10) << "targeti " << *targeti << dendl;
  CDentry *dn = targeti->get_parent_dn();
  CDentry::linkage_t *dnl = dn->get_linkage();
  assert(dnl->is_primary());

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  mdr->auth_pin(targeti);

  //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
  assert(g_conf->mds_kill_link_at != 5);

  // journal it
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
				      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
  mdlog->start_entry(le);

  inode_t *pi = dnl->get_inode()->project_inode();

  // update journaled target inode: bump or drop nlink per the op
  bool inc;
  if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
    inc = true;
    pi->nlink++;
  } else {
    inc = false;
    pi->nlink--;
  }

  // capture everything needed to undo this prepare if the master aborts
  link_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.ino = targeti->ino();
  rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
  const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
  rollback.old_dir_mtime = pf->fragstat.mtime;
  rollback.old_dir_rctime = pf->rstat.rctime;
  rollback.was_inc = inc;
  ::encode(rollback, le->rollback);
  mdr->more()->rollback_bl = le->rollback;

  pi->ctime = mdr->get_op_stamp();
  pi->version = targeti->pre_dirty();

  dout(10) << " projected inode " << pi << " v " << pi->version << dendl;

  // commit case
  mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
		     mdr, __func__);
  mdlog->flush();
}
5562
/*
 * Journal-completion half of handle_slave_link_prep(): apply the
 * projected nlink change and ack the master with OP_LINKPREPACK, unless
 * the request was aborted in the meantime.
 */
void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
{
  dout(10) << "_logged_slave_link " << *mdr
	   << " " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 6);

  // update the target
  targeti->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  // hit pop
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;

  // ack
  if (!mdr->aborted) {
    MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
						   MMDSSlaveRequest::OP_LINKPREPACK);
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
5591
5592
5593 struct C_MDS_CommittedSlave : public ServerLogContext {
5594 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
5595 void finish(int r) override {
5596 server->_committed_slave(mdr);
5597 }
5598 };
5599
/*
 * The master's commit/abort decision arrived for a prepared slave link.
 * r == 0: journal an OP_COMMIT ESlaveUpdate and notify the master via
 * C_MDS_CommittedSlave; otherwise undo the prepare via
 * do_link_rollback().
 */
void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
{
  dout(10) << "_commit_slave_link " << *mdr
	   << " r=" << r
	   << " " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 7);

  if (r == 0) {
    // drop our pins, etc.
    mdr->cleanup();

    // write a commit to the journal
    ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
					ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
    mdlog->start_entry(le);
    submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
    mdlog->flush();
  } else {
    do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}
5622
5623 void Server::_committed_slave(MDRequestRef& mdr)
5624 {
5625 dout(10) << "_committed_slave " << *mdr << dendl;
5626
5627 assert(g_conf->mds_kill_link_at != 8);
5628
5629 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5630 MMDSSlaveRequest::OP_COMMITTED);
5631 mds->send_message_mds(req, mdr->slave_to_mds);
5632 mdcache->request_finish(mdr);
5633 }
5634
5635 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5636 MutationRef mut;
5637 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5638 void finish(int r) override {
5639 server->_link_rollback_finish(mut, mdr);
5640 }
5641 };
5642
/*
 * Undo a prepared (but aborted) slave link/unlink using the state
 * captured in the rollback blob: restore the target inode's ctime and
 * nlink, and the parent dir's mtime/rctime, journaling the rollback as
 * an OP_ROLLBACK ESlaveUpdate.
 *
 * @param mdr  the live slave request, or null when invoked during resolve
 */
void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
  link_rollback rollback;
  bufferlist::iterator p = rbl.begin();
  ::decode(rollback, p);

  dout(10) << "do_link_rollback on " << rollback.reqid
	   << (rollback.was_inc ? " inc":" dec")
	   << " ino " << rollback.ino
	   << dendl;

  assert(g_conf->mds_kill_link_at != 9);

  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
  assert(mdr || mds->is_resolve());

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CInode *in = mdcache->get_inode(rollback.ino);
  assert(in);
  dout(10) << " target is " << *in << dendl;
  assert(!in->is_projected()); // live slave request hold versionlock xlock.

  inode_t *pi = in->project_inode();
  pi->version = in->pre_dirty();
  mut->add_projected_inode(in);

  // parent dir rctime
  CDir *parent = in->get_projected_parent_dn()->get_dir();
  fnode_t *pf = parent->project_fnode();
  mut->add_projected_fnode(parent);
  pf->version = parent->pre_dirty();
  // only restore the dir times if our op was the last to touch them
  if (pf->fragstat.mtime == pi->ctime) {
    pf->fragstat.mtime = rollback.old_dir_mtime;
    if (pf->rstat.rctime == pi->ctime)
      pf->rstat.rctime = rollback.old_dir_rctime;
    mut->add_updated_lock(&parent->get_inode()->filelock);
    mut->add_updated_lock(&parent->get_inode()->nestlock);
  }

  // inode: undo the ctime update and the nlink change
  pi->ctime = rollback.old_ctime;
  if (rollback.was_inc)
    pi->nlink--;
  else
    pi->nlink++;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
  mdlog->start_entry(le);
  le->commit.add_dir_context(parent);
  le->commit.add_dir(parent, true);
  le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);

  submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
		     mdr, __func__);
  mdlog->flush();
}
5703
/*
 * Journal-completion half of do_link_rollback(): apply the rolled-back
 * state, finish the live slave request (if any), and retire the
 * rollback record.
 */
void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
{
  dout(10) << "_link_rollback_finish" << dendl;

  assert(g_conf->mds_kill_link_at != 10);

  mut->apply();
  // mdr is null when the rollback was driven by resolve
  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(mut->reqid);

  mut->cleanup();
}
5718
5719
5720 /* This function DOES NOT put the passed message before returning*/
/*
 * Master side: a slave acked our LINKPREP/UNLINKPREP.  Record it as a
 * witness, clear it from the waiting set, and re-dispatch the request so
 * _link_remote() can proceed with its phase-2 local journaling.
 */
void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
{
  dout(10) << "handle_slave_link_prep_ack " << *mdr
	   << " " << *m << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  assert(g_conf->mds_kill_link_at != 11);

  // note slave
  mdr->more()->slaves.insert(from);

  // witnessed!
  assert(mdr->more()->witnessed.count(from) == 0);
  mdr->more()->witnessed.insert(from);
  assert(!m->is_not_journaled());
  mdr->more()->has_journaled_slaves = true;

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // link prep involves a single slave, so nothing else should be pending
  assert(mdr->more()->waiting_on_slave.empty());

  dispatch_client_request(mdr); // go again!
}
5746
5747
5748
5749
5750
5751 // UNLINK
5752
/*
 * Handle a client unlink or rmdir request.
 *
 * Traverses to the target dentry, validates the operation (rmdir only
 * on dirs, unlink only on non-dirs, dir must be empty), acquires the
 * necessary locks, gathers remote witnesses when removing a dir with
 * subtree-root dirfrags, and finally dispatches to _unlink_local() or
 * _link_remote().
 */
void Server::handle_client_unlink(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();

  // rmdir or unlink?
  bool rmdir = false;
  if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;

  if (req->get_filepath().depth() == 0) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // traverse to path
  vector<CDentry*> trace;
  CInode *in;
  int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
  if (r > 0) return;    // traversal in progress; we'll be re-dispatched
  if (r < 0) {
    if (r == -ESTALE) {
      // base ino unknown locally; ask peers before failing the request
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
      return;
    }
    respond_to_request(mdr, r);
    return;
  }
  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshotted namespace is read-only
    respond_to_request(mdr, -EROFS);
    return;
  }

  CDentry *dn = trace[trace.size()-1];
  assert(dn);
  if (!dn->is_auth()) {
    // the dentry auth performs the unlink; forward to it
    mdcache->request_forward(mdr, dn->authority().first);
    return;
  }

  CInode *diri = dn->get_dir()->get_inode();

  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  assert(!dnl->is_null());

  if (rmdir) {
    dout(7) << "handle_client_rmdir on " << *dn << dendl;
  } else {
    dout(7) << "handle_client_unlink on " << *dn << dendl;
  }
  dout(7) << "dn links to " << *in << dendl;

  // rmdir vs is_dir
  if (in->is_dir()) {
    if (rmdir) {
      // do empty directory checks (fast path, re-checked under locks below)
      if (_dir_is_nonempty_unlocked(mdr, in)) {
	respond_to_request(mdr, -ENOTEMPTY);
	return;
      }
    } else {
      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -EISDIR);
      return;
    }
  } else {
    if (rmdir) {
      // unlink
      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -ENOTDIR);
      return;
    }
  }

  // -- create stray dentry? --
  // a primary link needs a stray dentry to keep the inode alive after unlink
  CDentry *straydn = NULL;
  if (dnl->is_primary()) {
    straydn = prepare_stray_dentry(mdr, dnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    // leftover from an earlier pass of this request; no longer needed
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }

  // lock
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  for (int i=0; i<(int)trace.size()-1; i++)
    rdlocks.insert(&trace[i]->lock);
  xlocks.insert(&dn->lock);
  wrlocks.insert(&diri->filelock);    // parent dir fragstat will change
  wrlocks.insert(&diri->nestlock);    // ... and rstat
  xlocks.insert(&in->linklock);       // nlink is changing
  if (straydn) {
    wrlocks.insert(&straydn->get_dir()->inode->filelock);
    wrlocks.insert(&straydn->get_dir()->inode->nestlock);
    xlocks.insert(&straydn->lock);
  }
  if (in->is_dir())
    rdlocks.insert(&in->filelock);   // to verify it's empty
  mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // authoritative emptiness check now that we hold the filelock
  if (in->is_dir() &&
      _dir_is_nonempty(mdr, in)) {
    respond_to_request(mdr, -ENOTEMPTY);
    return;
  }

  // check access only on the first pass (no witnesses recorded yet)
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, diri, MAY_WRITE))
      return;
  }

  // yay!
  if (in->is_dir() && in->has_subtree_root_dirfrag()) {
    // subtree root auths need to be witnesses
    set<mds_rank_t> witnesses;
    in->list_replicas(witnesses);
    dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

    for (set<mds_rank_t>::iterator p = witnesses.begin();
	 p != witnesses.end();
	 ++p) {
      if (mdr->more()->witnessed.count(*p)) {
	dout(10) << " already witnessed by mds." << *p << dendl;
      } else if (mdr->more()->waiting_on_slave.count(*p)) {
	dout(10) << " already waiting on witness mds." << *p << dendl;
      } else {
	if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
	  return;
      }
    }
    if (!mdr->more()->waiting_on_slave.empty())
      return;  // we're waiting for a witness.
  }

  // ok!
  if (dnl->is_remote() && !dnl->get_inode()->is_auth())
    // remote link whose inode lives on another mds: that auth drops the
    // nlink (shared path with link(2), hence _link_remote)
    _link_remote(mdr, false, dn, dnl->get_inode());
  else
    _unlink_local(mdr, dn, straydn);
}
5900
5901 class C_MDS_unlink_local_finish : public ServerLogContext {
5902 CDentry *dn;
5903 CDentry *straydn;
5904 version_t dnpv; // deleted dentry
5905 public:
5906 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
5907 ServerLogContext(s, r), dn(d), straydn(sd),
5908 dnpv(d->get_projected_version()) {}
5909 void finish(int r) override {
5910 assert(r == 0);
5911 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
5912 }
5913 };
5914
/*
 * Journal a local unlink/rmdir.
 *
 * Projects the removal of the dentry (and, for a primary link, the
 * relinking of the inode under a stray dentry), journals an EUpdate,
 * and completes asynchronously via C_MDS_unlink_local_finish.
 *
 * @param mdr      the client request
 * @param dn       the dentry being unlinked
 * @param straydn  stray dentry for the inode; non-null iff dn is a primary link
 */
void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_unlink_local " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  CInode *in = dnl->get_inode();

  SnapRealm *realm = in->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();

  // ok, let's do it.
  mdr->ls = mdlog->get_current_segment();

  // prepare log entry
  EUpdate *le = new EUpdate(mdlog, "unlink_local");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    // distributed op: remember the slaves so resolve can sort out failures
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_slaves = true;
    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
  }

  if (straydn) {
    assert(dnl->is_primary());
    straydn->push_projected_linkage(in);
    straydn->first = follows + 1;  // stray's live range starts past the newest snap
  }

  // the unlinked dentry
  dn->pre_dirty();

  inode_t *pi = in->project_inode();
  // record where the inode was unlinked from, for stray diagnostics
  dn->make_path_string(pi->stray_prior_path, true);
  mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
  pi->version = in->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->nlink--;
  if (pi->nlink == 0)
    in->state_set(CInode::STATE_ORPHAN);

  if (dnl->is_primary()) {
    // primary link. add stray dentry.
    assert(straydn);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);

    // project snaprealm, too
    if (in->snaprealm || follows + 1 > in->get_oldest_snap())
      in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());

    pi->update_backtrace();
    le->metablob.add_primary_dentry(straydn, in, true, true);
  } else {
    // remote link. update remote inode.
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
  }

  mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
  le->metablob.add_null_dentry(dn, true);

  if (in->is_dir()) {
    // replay needs to know the dir moved so subtrees can be fixed up
    dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
    le->metablob.renamed_dirino = in->ino();
  }

  dn->push_projected_linkage();

  if (in->is_dir()) {
    assert(straydn);
    mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  }

  journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
}
5994
/*
 * Called after the unlink_local EUpdate is journaled: apply the dentry
 * unlink (and stray relink), notify replicas and clients, and reply.
 *
 * @param dnpv  projected version of the deleted dentry (captured at
 *              journal-submit time by C_MDS_unlink_local_finish)
 */
void Server::_unlink_local_finish(MDRequestRef& mdr,
				  CDentry *dn, CDentry *straydn,
				  version_t dnpv)
{
  dout(10) << "_unlink_local_finish " << *dn << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  // unlink main dentry
  dn->get_dir()->unlink_inode(dn);
  dn->pop_projected_linkage();

  // relink as stray?  (i.e. was primary link?)
  CInode *strayin = NULL;
  bool snap_is_new = false;
  if (straydn) {
    dout(20) << " straydn is " << *straydn << dendl;
    CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
    strayin = straydnl->get_inode();

    snap_is_new = strayin->snaprealm ? true : false;
    mdcache->touch_dentry_bottom(straydn);  // stray goes to end of lru
  }

  dn->mark_dirty(dnpv, mdr->ls);
  mdr->apply();

  if (snap_is_new) //only new if strayin exists
    mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);

  mdcache->send_dentry_unlink(dn, straydn, mdr);

  // update subtree map?
  if (straydn && strayin->is_dir())
    mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);

  // bump pop
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  // removing a new dn?
  dn->get_dir()->try_remove_unlinked_dn(dn);

  // clean up ?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    // Tip off the MDCache that this dentry is a stray that
    // might be eligible for purge.
    mdcache->notify_stray(straydn);
  }
}
6049
6050 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6051 {
6052 if (mds->is_cluster_degraded() &&
6053 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6054 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6055 if (mdr->more()->waiting_on_slave.empty())
6056 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6057 return false;
6058 }
6059
6060 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6061 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6062 MMDSSlaveRequest::OP_RMDIRPREP);
6063 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6064 for (auto dn : trace)
6065 req->srcdnpath.push_dentry(dn->name);
6066 mdcache->replicate_stray(straydn, who, req->stray);
6067
6068 req->op_stamp = mdr->get_op_stamp();
6069 mds->send_message_mds(req, who);
6070
6071 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6072 mdr->more()->waiting_on_slave.insert(who);
6073 return true;
6074 }
6075
6076 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6077 CDentry *dn, *straydn;
6078 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6079 : ServerLogContext(s, r), dn(d), straydn(st) {}
6080 void finish(int r) override {
6081 server->_logged_slave_rmdir(mdr, dn, straydn);
6082 }
6083 };
6084
6085 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6086 MDRequestRef mdr;
6087 CDentry *straydn;
6088 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6089 : ServerContext(s), mdr(r), straydn(sd) { }
6090 void finish(int r) override {
6091 server->_commit_slave_rmdir(mdr, r, straydn);
6092 }
6093 };
6094
/*
 * Slave side of a cross-mds rmdir (OP_RMDIRPREP).
 *
 * Traverse to the target dentry, record rollback state, then either
 * (a) if we hold no auth subtree under the dir, relink it under the
 * stray in memory and ack without journaling, or (b) project the
 * relink, journal an ESlaveUpdate prepare, and ack later from
 * _logged_slave_rmdir().
 */
void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_rmdir_prep " << *mdr
	   << " " << mdr->slave_request->srcdnpath
	   << " to " << mdr->slave_request->destdnpath
	   << dendl;

  vector<CDentry*> trace;
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *in;
  int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;   // traversal in progress; we'll be re-dispatched
  if (r == -ESTALE) {
    // the master knows this ino; locate it via peers, then retry
    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
			    mdr->slave_to_mds);
    return;
  }
  assert(r == 0);
  CDentry *dn = trace[trace.size()-1];
  dout(10) << " dn " << *dn << dendl;
  mdr->pin(dn);

  // the stray dentry must already be attached to this request
  // (presumably decoded from the master's message earlier — see mdr->straydn)
  assert(mdr->straydn);
  CDentry *straydn = mdr->straydn;
  dout(10) << " straydn " << *straydn << dendl;

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  // stash everything needed to undo the relink if the master aborts
  rmdir_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.src_dir = dn->get_dir()->dirfrag();
  rollback.src_dname = dn->name;
  rollback.dest_dir = straydn->get_dir()->dirfrag();
  rollback.dest_dname = straydn->name;
  ::encode(rollback, mdr->more()->rollback_bl);
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);

  if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
    // case (a): nothing of ours to journal; apply in memory and ack now
    dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
    dn->get_dir()->unlink_inode(dn);
    straydn->get_dir()->link_primary_inode(straydn, in);

    assert(straydn->first >= in->first);
    in->first = straydn->first;

    mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);

    MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
						   MMDSSlaveRequest::OP_RMDIRPREPACK);
    reply->mark_not_journaled();   // tells the master not to expect a commit from us
    mds->send_message_mds(reply, mdr->slave_to_mds);

    // send caps to auth (if we're not already)
    if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
      mdcache->migrator->export_caps(in);

    mdcache->touch_dentry_bottom(straydn); // move stray to end of lru

    mdr->slave_request->put();
    mdr->slave_request = 0;
    mdr->straydn = 0;
    return;
  }

  // case (b): project the relink and journal a prepare entry
  straydn->push_projected_linkage(in);
  dn->push_projected_linkage();

  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
				      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  le->commit.add_dir_context(straydn->get_dir());
  le->commit.add_primary_dentry(straydn, in, true);
  // slave: no need to journal original dentry

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
		     mdr, __func__);
  mdlog->flush();
}
6185
/*
 * The slave's rmdir prepare entry is durable: apply the projected
 * relink to the cache and ack the master (unless the request was
 * aborted while we were journaling).
 */
void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;

  // update our cache now, so we are consistent with what is in the journal
  // when we journal a subtree map
  CInode *in = dn->get_linkage()->get_inode();
  dn->get_dir()->unlink_inode(dn);
  straydn->pop_projected_linkage();
  dn->pop_projected_linkage();
  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;
  mdr->straydn = 0;

  if (!mdr->aborted) {
    MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
						   MMDSSlaveRequest::OP_RMDIRPREPACK);
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
6212
/*
 * Master side: a witness acked our OP_RMDIRPREP.  Record the witness
 * and, once all outstanding witnesses have answered, re-dispatch the
 * client request.
 */
void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
	   << " " << *ack << dendl;

  mds_rank_t from = mds_rank_t(ack->get_source().num());

  mdr->more()->slaves.insert(from);
  mdr->more()->witnessed.insert(from);
  if (!ack->is_not_journaled())
    // this slave journaled a prepare, so it will need a commit/abort later
    mdr->more()->has_journaled_slaves = true;

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
6234
/*
 * Master's resolution arrived for the rmdir we prepared: commit when
 * r == 0 (journaling an OP_COMMIT if we journaled a prepare), or roll
 * back otherwise.
 */
void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
  dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;

  if (r == 0) {
    if (mdr->more()->slave_update_journaled) {
      // snapless stray: drop its dirty bits (see clear_dirty_bits_for_stray)
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
					  mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
					  ESlaveUpdate::RMDIR);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      // nothing was journaled at prepare time; just mark committed
      _committed_slave(mdr);
    }
  } else {
    // abort
    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}
6264
6265 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6266 metareqid_t reqid;
6267 CDentry *dn;
6268 CDentry *straydn;
6269 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6270 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6271 void finish(int r) override {
6272 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6273 }
6274 };
6275
6276 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6277 {
6278 // unlink the other rollback methods, the rmdir rollback is only
6279 // needed to record the subtree changes in the journal for inode
6280 // replicas who are auth for empty dirfrags. no actual changes to
6281 // the file system are taking place here, so there is no Mutation.
6282
6283 rmdir_rollback rollback;
6284 bufferlist::iterator p = rbl.begin();
6285 ::decode(rollback, p);
6286
6287 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6288 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6289 assert(mdr || mds->is_resolve());
6290
6291 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6292 if (!dir)
6293 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6294 assert(dir);
6295 CDentry *dn = dir->lookup(rollback.src_dname);
6296 assert(dn);
6297 dout(10) << " dn " << *dn << dendl;
6298 dir = mdcache->get_dirfrag(rollback.dest_dir);
6299 assert(dir);
6300 CDentry *straydn = dir->lookup(rollback.dest_dname);
6301 assert(straydn);
6302 dout(10) << " straydn " << *dn << dendl;
6303 CInode *in = straydn->get_linkage()->get_inode();
6304
6305 if (mdr && !mdr->more()->slave_update_journaled) {
6306 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6307
6308 straydn->get_dir()->unlink_inode(straydn);
6309 dn->get_dir()->link_primary_inode(dn, in);
6310
6311 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6312
6313 mdcache->request_finish(mdr);
6314 mdcache->finish_rollback(rollback.reqid);
6315 return;
6316 }
6317
6318 dn->push_projected_linkage(in);
6319 straydn->push_projected_linkage();
6320
6321 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6322 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6323 mdlog->start_entry(le);
6324
6325 le->commit.add_dir_context(dn->get_dir());
6326 le->commit.add_primary_dentry(dn, in, true);
6327 // slave: no need to journal straydn
6328
6329 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6330 le->commit.renamed_dirino = in->ino();
6331
6332 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6333
6334 submit_mdlog_entry(le,
6335 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6336 dn, straydn),
6337 mdr, __func__);
6338 mdlog->flush();
6339 }
6340
/*
 * The rollback journal entry is durable: undo the projected relink,
 * fix the subtree map, and finish the rollback bookkeeping.
 */
void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  // move the inode back from the stray to its original dentry
  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
  if (mds->is_resolve()) {
    // during resolve we may now be able to trim the non-auth subtree
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(reqid);
}
6361
6362
/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. whether we can rmdir it).
 *
 * the unlocked variant is a fastpath check; we can't really be
 * sure until we rdlock the filelock.
 */
6370 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6371 {
6372 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6373 assert(in->is_auth());
6374
6375 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6376 return true; // in a snapshot!
6377
6378 list<CDir*> ls;
6379 in->get_dirfrags(ls);
6380 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6381 CDir *dir = *p;
6382 // is the frag obviously non-empty?
6383 if (dir->is_auth()) {
6384 if (dir->get_projected_fnode()->fragstat.size()) {
6385 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6386 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6387 return true;
6388 }
6389 }
6390 }
6391
6392 return false;
6393 }
6394
6395 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6396 {
6397 dout(10) << "dir_is_nonempty " << *in << dendl;
6398 assert(in->is_auth());
6399 assert(in->filelock.can_read(mdr->get_client()));
6400
6401 frag_info_t dirstat;
6402 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6403
6404 list<CDir*> ls;
6405 in->get_dirfrags(ls);
6406 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6407 CDir *dir = *p;
6408 const fnode_t *pf = dir->get_projected_fnode();
6409 if (pf->fragstat.size()) {
6410 dout(10) << "dir_is_nonempty dirstat has "
6411 << pf->fragstat.size() << " items " << *dir << dendl;
6412 return true;
6413 }
6414
6415 if (pf->accounted_fragstat.version == dirstat_version)
6416 dirstat.add(pf->accounted_fragstat);
6417 else
6418 dirstat.add(pf->fragstat);
6419 }
6420
6421 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6422 }
6423
6424
6425 // ======================================================
6426
6427
6428 class C_MDS_rename_finish : public ServerLogContext {
6429 CDentry *srcdn;
6430 CDentry *destdn;
6431 CDentry *straydn;
6432 public:
6433 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6434 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6435 ServerLogContext(s, r),
6436 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6437 void finish(int r) override {
6438 assert(r == 0);
6439 server->_rename_finish(mdr, srcdn, destdn, straydn);
6440 }
6441 };
6442
6443
/** handle_client_rename
 *
 * rename master is the destdn auth.  this is because cached inodes
 * must remain connected.  thus, any replica of srci must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * to do this, we freeze srci, then master (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn.  note that
 * destdn replicas need not also replicate srci.  this only works when
 * destdn is master.
 *
 * This function takes responsibility for the passed mdr.
 */
6458 void Server::handle_client_rename(MDRequestRef& mdr)
6459 {
6460 MClientRequest *req = mdr->client_request;
6461 dout(7) << "handle_client_rename " << *req << dendl;
6462
6463 filepath destpath = req->get_filepath();
6464 filepath srcpath = req->get_filepath2();
6465 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6466 respond_to_request(mdr, -EINVAL);
6467 return;
6468 }
6469 const string &destname = destpath.last_dentry();
6470
6471 vector<CDentry*>& srctrace = mdr->dn[1];
6472 vector<CDentry*>& desttrace = mdr->dn[0];
6473
6474 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6475
6476 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6477 if (!destdn) return;
6478 dout(10) << " destdn " << *destdn << dendl;
6479 if (mdr->snapid != CEPH_NOSNAP) {
6480 respond_to_request(mdr, -EROFS);
6481 return;
6482 }
6483 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6484 CDir *destdir = destdn->get_dir();
6485 assert(destdir->is_auth());
6486
6487 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6488 if (r > 0)
6489 return; // delayed
6490 if (r < 0) {
6491 if (r == -ESTALE) {
6492 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6493 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6494 } else {
6495 dout(10) << "FAIL on error " << r << dendl;
6496 respond_to_request(mdr, r);
6497 }
6498 return;
6499
6500 }
6501 assert(!srctrace.empty());
6502 CDentry *srcdn = srctrace[srctrace.size()-1];
6503 dout(10) << " srcdn " << *srcdn << dendl;
6504 if (srcdn->last != CEPH_NOSNAP) {
6505 respond_to_request(mdr, -EROFS);
6506 return;
6507 }
6508 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6509 CInode *srci = srcdnl->get_inode();
6510 dout(10) << " srci " << *srci << dendl;
6511
6512 CInode *oldin = 0;
6513 if (!destdnl->is_null()) {
6514 //dout(10) << "dest dn exists " << *destdn << dendl;
6515 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6516 if (!oldin) return;
6517 dout(10) << " oldin " << *oldin << dendl;
6518
6519 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6520 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6521 respond_to_request(mdr, -ENOTEMPTY);
6522 return;
6523 }
6524
6525 // if srcdn is replica, need to make sure its linkage is correct
6526 if (srcdn->is_auth() ||
6527 srcdn->lock.can_read(mdr->get_client()) ||
6528 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
6529 // mv /some/thing /to/some/existing_other_thing
6530 if (oldin->is_dir() && !srci->is_dir()) {
6531 respond_to_request(mdr, -EISDIR);
6532 return;
6533 }
6534 if (!oldin->is_dir() && srci->is_dir()) {
6535 respond_to_request(mdr, -ENOTDIR);
6536 return;
6537 }
6538 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6539 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6540 return;
6541 }
6542 }
6543 }
6544
6545 // -- some sanity checks --
6546
6547 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6548 if (destpath.get_ino() != srcpath.get_ino() &&
6549 !(req->get_source().is_mds() &&
6550 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6551 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6552 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6553 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6554 while (srcbase != destbase &&
6555 !srcbase->is_projected_ancestor_of(destbase)) {
6556 CDentry *pdn = srcbase->get_projected_parent_dn();
6557 srctrace.insert(srctrace.begin(), pdn);
6558 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6559 srcbase = pdn->get_dir()->get_inode();
6560 }
6561
6562 // then, extend destpath until it shares the same parent inode as srcpath.
6563 while (destbase != srcbase) {
6564 CDentry *pdn = destbase->get_projected_parent_dn();
6565 desttrace.insert(desttrace.begin(), pdn);
6566 rdlocks.insert(&pdn->lock);
6567 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6568 destbase = pdn->get_dir()->get_inode();
6569 }
6570 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6571 }
6572
6573 // src == dest?
6574 if (srcdn->get_dir() == destdir && srcdn->name == destname) {
6575 dout(7) << "rename src=dest, noop" << dendl;
6576 respond_to_request(mdr, 0);
6577 return;
6578 }
6579
6580 // dest a child of src?
6581 // e.g. mv /usr /usr/foo
6582 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6583 while (pdn) {
6584 if (pdn == srcdn) {
6585 dout(7) << "cannot rename item to be a child of itself" << dendl;
6586 respond_to_request(mdr, -EINVAL);
6587 return;
6588 }
6589 pdn = pdn->get_dir()->inode->parent;
6590 }
6591
6592 // is this a stray migration, reintegration or merge? (sanity checks!)
6593 if (mdr->reqid.name.is_mds() &&
6594 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6595 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6596 !(destdnl->is_remote() &&
6597 destdnl->get_remote_ino() == srci->ino())) {
6598 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
6599 return;
6600 }
6601
6602 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6603 (srcdnl->is_primary() || destdnl->is_primary()));
6604 if (linkmerge)
6605 dout(10) << " this is a link merge" << dendl;
6606
6607 // -- create stray dentry? --
6608 CDentry *straydn = NULL;
6609 if (destdnl->is_primary() && !linkmerge) {
6610 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6611 if (!straydn)
6612 return;
6613 dout(10) << " straydn is " << *straydn << dendl;
6614 } else if (mdr->straydn) {
6615 mdr->unpin(mdr->straydn);
6616 mdr->straydn = NULL;
6617 }
6618
6619 // -- prepare witness list --
6620 /*
6621 * NOTE: we use _all_ replicas as witnesses.
6622 * this probably isn't totally necessary (esp for file renames),
6623 * but if/when we change that, we have to make sure rejoin is
6624 * sufficiently robust to handle strong rejoins from survivors
6625 * with totally wrong dentry->inode linkage.
6626 * (currently, it can ignore rename effects, because the resolve
6627 * stage will sort them out.)
6628 */
6629 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6630 if (srcdn->is_auth())
6631 srcdn->list_replicas(witnesses);
6632 else
6633 witnesses.insert(srcdn->authority().first);
6634 if (srcdnl->is_remote() && !srci->is_auth())
6635 witnesses.insert(srci->authority().first);
6636 destdn->list_replicas(witnesses);
6637 if (destdnl->is_remote() && !oldin->is_auth())
6638 witnesses.insert(oldin->authority().first);
6639 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6640
6641
6642 // -- locks --
6643 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6644
6645 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6646 for (int i=0; i<(int)srctrace.size(); i++)
6647 rdlocks.insert(&srctrace[i]->lock);
6648 xlocks.insert(&srcdn->lock);
6649 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6650 if (srcdirauth != mds->get_nodeid()) {
6651 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6652 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6653 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6654 if (srci->is_dir())
6655 rdlocks.insert(&srci->dirfragtreelock);
6656 } else {
6657 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6658 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6659 }
6660 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6661
6662 // straydn?
6663 if (straydn) {
6664 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6665 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6666 xlocks.insert(&straydn->lock);
6667 }
6668
6669 // xlock versionlock on dentries if there are witnesses.
6670 // replicas can't see projected dentry linkages, and will get
6671 // confused if we try to pipeline things.
6672 if (!witnesses.empty()) {
6673 // take xlock on all projected ancestor dentries for srcdn and destdn.
6674 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6675 for (int i= 0; i<(int)srctrace.size(); i++) {
6676 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6677 xlocks.insert(&srctrace[i]->versionlock);
6678 }
6679 for (int i=0; i<(int)desttrace.size(); i++) {
6680 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6681 xlocks.insert(&desttrace[i]->versionlock);
6682 }
6683 // xlock srci and oldin's primary dentries, so witnesses can call
6684 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6685 // is traversed.
6686 if (srcdnl->is_remote())
6687 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6688 if (destdnl->is_remote())
6689 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6690 }
6691
6692 // we need to update srci's ctime. xlock its least contended lock to do that...
6693 xlocks.insert(&srci->linklock);
6694
6695 // xlock oldin (for nlink--)
6696 if (oldin) {
6697 xlocks.insert(&oldin->linklock);
6698 if (oldin->is_dir())
6699 rdlocks.insert(&oldin->filelock);
6700 }
6701 if (srcdnl->is_primary() && srci->is_dir())
6702 // FIXME: this should happen whenever we are renaming between
6703 // realms, regardless of the file type
6704 // FIXME: If/when this changes, make sure to update the
6705 // "allowance" in handle_slave_rename_prep
6706 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6707 else
6708 rdlocks.insert(&srci->snaplock);
6709
6710 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6711 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6712 &remote_wrlocks, auth_pin_freeze))
6713 return;
6714
6715 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6716 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6717 return;
6718
6719 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6720 return;
6721
6722 if (!check_fragment_space(mdr, destdn->get_dir()))
6723 return;
6724
6725 if (!check_access(mdr, srci, MAY_WRITE))
6726 return;
6727 }
6728
6729 // with read lock, really verify oldin is empty
6730 if (oldin &&
6731 oldin->is_dir() &&
6732 _dir_is_nonempty(mdr, oldin)) {
6733 respond_to_request(mdr, -ENOTEMPTY);
6734 return;
6735 }
6736
6737 /* project_past_snaprealm_parent() will do this job
6738 *
6739 // moving between snaprealms?
6740 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6741 SnapRealm *srcrealm = srci->find_snaprealm();
6742 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6743 if (srcrealm != destrealm &&
6744 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6745 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6746 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6747 mdcache->snaprealm_create(mdr, srci);
6748 return;
6749 }
6750 }
6751 */
6752
6753 assert(g_conf->mds_kill_rename_at != 1);
6754
6755 // -- open all srcdn inode frags, if any --
6756 // we need these open so that auth can properly delegate from inode to dirfrags
6757 // after the inode is _ours_.
6758 if (srcdnl->is_primary() &&
6759 !srcdn->is_auth() &&
6760 srci->is_dir()) {
6761 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6762 mdr->set_stickydirs(srci);
6763
6764 list<frag_t> frags;
6765 srci->dirfragtree.get_leaves(frags);
6766 for (list<frag_t>::iterator p = frags.begin();
6767 p != frags.end();
6768 ++p) {
6769 CDir *dir = srci->get_dirfrag(*p);
6770 if (!dir) {
6771 dout(10) << " opening " << *p << " under " << *srci << dendl;
6772 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6773 return;
6774 }
6775 }
6776 }
6777
6778 // -- prepare witnesses --
6779
6780 // do srcdn auth last
6781 mds_rank_t last = MDS_RANK_NONE;
6782 if (!srcdn->is_auth()) {
6783 last = srcdn->authority().first;
6784 mdr->more()->srcdn_auth_mds = last;
6785 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6786 // are involved in the rename operation.
6787 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6788 dout(10) << " preparing ambiguous auth for srci" << dendl;
6789 assert(mdr->more()->is_remote_frozen_authpin);
6790 assert(mdr->more()->rename_inode == srci);
6791 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6792 return;
6793 }
6794 }
6795
6796 for (set<mds_rank_t>::iterator p = witnesses.begin();
6797 p != witnesses.end();
6798 ++p) {
6799 if (*p == last) continue; // do it last!
6800 if (mdr->more()->witnessed.count(*p)) {
6801 dout(10) << " already witnessed by mds." << *p << dendl;
6802 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6803 dout(10) << " already waiting on witness mds." << *p << dendl;
6804 } else {
6805 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6806 return;
6807 }
6808 }
6809 if (!mdr->more()->waiting_on_slave.empty())
6810 return; // we're waiting for a witness.
6811
6812 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6813 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6814 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6815 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6816 return;
6817 }
6818
6819 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6820 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6821 assert(g_conf->mds_kill_rename_at != 3);
6822 if (!mdr->more()->slaves.empty() && srci->is_dir())
6823 assert(g_conf->mds_kill_rename_at != 4);
6824
6825 // -- declare now --
6826 mdr->set_mds_stamp(ceph_clock_now());
6827
6828 // -- prepare journal entry --
6829 mdr->ls = mdlog->get_current_segment();
6830 EUpdate *le = new EUpdate(mdlog, "rename");
6831 mdlog->start_entry(le);
6832 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6833 if (!mdr->more()->witnessed.empty()) {
6834 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6835
6836 le->reqid = mdr->reqid;
6837 le->had_slaves = true;
6838
6839 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6840 // no need to send frozen auth pin to recovering auth MDS of srci
6841 mdr->more()->is_remote_frozen_authpin = false;
6842 }
6843
6844 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6845 if (le->client_map.length())
6846 le->cmapv = mds->sessionmap.get_projected();
6847
6848 // -- commit locally --
6849 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6850
6851 journal_and_reply(mdr, srci, destdn, le, fin);
6852 }
6853
6854
6855 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
6856 {
6857 dout(10) << "_rename_finish " << *mdr << dendl;
6858
6859 if (!mdr->more()->witnessed.empty())
6860 mdcache->logged_master_update(mdr->reqid);
6861
6862 // apply
6863 _rename_apply(mdr, srcdn, destdn, straydn);
6864
6865 mdcache->send_dentry_link(destdn, mdr);
6866
6867 CDentry::linkage_t *destdnl = destdn->get_linkage();
6868 CInode *in = destdnl->get_inode();
6869 bool need_eval = mdr->more()->cap_imports.count(in);
6870
6871 // test hack: test slave commit
6872 if (!mdr->more()->slaves.empty() && !in->is_dir())
6873 assert(g_conf->mds_kill_rename_at != 5);
6874 if (!mdr->more()->slaves.empty() && in->is_dir())
6875 assert(g_conf->mds_kill_rename_at != 6);
6876
6877 // bump popularity
6878 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
6879 if (destdnl->is_remote() && in->is_auth())
6880 mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
6881
6882 // did we import srci? if so, explicitly ack that import that, before we unlock and reply.
6883
6884 assert(g_conf->mds_kill_rename_at != 7);
6885
6886 // reply
6887 respond_to_request(mdr, 0);
6888
6889 if (need_eval)
6890 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
6891
6892 // clean up?
6893 // respond_to_request() drops locks. So stray reintegration can race with us.
6894 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6895 mdcache->notify_stray(straydn);
6896 }
6897 }
6898
6899
6900
6901 // helpers
6902
6903 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
6904 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
6905 {
6906 if (mds->is_cluster_degraded() &&
6907 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6908 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
6909 if (mdr->more()->waiting_on_slave.empty())
6910 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6911 return false;
6912 }
6913
6914 dout(10) << "_rename_prepare_witness mds." << who << dendl;
6915 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6916 MMDSSlaveRequest::OP_RENAMEPREP);
6917
6918 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
6919 for (auto dn : srctrace)
6920 req->srcdnpath.push_dentry(dn->name);
6921 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
6922 for (auto dn : dsttrace)
6923 req->destdnpath.push_dentry(dn->name);
6924 if (straydn)
6925 mdcache->replicate_stray(straydn, who, req->stray);
6926
6927 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
6928
6929 // srcdn auth will verify our current witness list is sufficient
6930 req->witnesses = witnesse;
6931
6932 req->op_stamp = mdr->get_op_stamp();
6933 mds->send_message_mds(req, who);
6934
6935 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6936 mdr->more()->waiting_on_slave.insert(who);
6937 return true;
6938 }
6939
6940 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
6941 {
6942 version_t oldpv = mdr->more()->inode_import_v;
6943
6944 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
6945
6946 /* import node */
6947 bufferlist::iterator blp = mdr->more()->inode_import.begin();
6948
6949 // imported caps
6950 ::decode(mdr->more()->imported_client_map, blp);
6951 ::encode(mdr->more()->imported_client_map, *client_map_bl,
6952 mds->mdsmap->get_up_features());
6953 prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
6954
6955 list<ScatterLock*> updated_scatterlocks;
6956 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
6957 mdr->more()->cap_imports, updated_scatterlocks);
6958
6959 // hack: force back to !auth and clean, temporarily
6960 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
6961 srcdnl->get_inode()->mark_clean();
6962
6963 return oldpv;
6964 }
6965
6966 bool Server::_need_force_journal(CInode *diri, bool empty)
6967 {
6968 list<CDir*> ls;
6969 diri->get_dirfrags(ls);
6970
6971 bool force_journal = false;
6972 if (empty) {
6973 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6974 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
6975 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
6976 force_journal = true;
6977 break;
6978 } else
6979 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
6980 }
6981 } else {
6982 // see if any children of our frags are auth subtrees.
6983 list<CDir*> subtrees;
6984 mdcache->list_subtrees(subtrees);
6985 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
6986 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6987 CDir *dir = *p;
6988 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
6989 if (dir->contains(*q)) {
6990 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
6991 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
6992 << **q << dendl;
6993 force_journal = true;
6994 break;
6995 } else
6996 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
6997 } else
6998 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
6999 }
7000 if (force_journal)
7001 break;
7002 }
7003 }
7004 return force_journal;
7005 }
7006
7007 void Server::_rename_prepare(MDRequestRef& mdr,
7008 EMetaBlob *metablob, bufferlist *client_map_bl,
7009 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7010 {
7011 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7012 if (straydn)
7013 dout(10) << " straydn " << *straydn << dendl;
7014
7015 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7016 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7017 CInode *srci = srcdnl->get_inode();
7018 CInode *oldin = destdnl->get_inode();
7019
7020 // primary+remote link merge?
7021 bool linkmerge = (srci == destdnl->get_inode() &&
7022 (srcdnl->is_primary() || destdnl->is_primary()));
7023 bool silent = srcdn->get_dir()->inode->is_stray();
7024
7025 bool force_journal_dest = false;
7026 if (srci->is_dir() && !destdn->is_auth()) {
7027 if (srci->is_auth()) {
7028 // if we are auth for srci and exporting it, force journal because journal replay needs
7029 // the source inode to create auth subtrees.
7030 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7031 force_journal_dest = true;
7032 } else
7033 force_journal_dest = _need_force_journal(srci, false);
7034 }
7035
7036 bool force_journal_stray = false;
7037 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7038 force_journal_stray = _need_force_journal(oldin, true);
7039
7040 if (linkmerge)
7041 dout(10) << " merging remote and primary links to the same inode" << dendl;
7042 if (silent)
7043 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7044 if (force_journal_dest)
7045 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7046 if (force_journal_stray)
7047 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7048
7049 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7050 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7051 metablob->renamed_dirino = srci->ino();
7052 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7053 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7054 metablob->renamed_dirino = oldin->ino();
7055 }
7056
7057 // prepare
7058 inode_t *pi = 0; // renamed inode
7059 inode_t *tpi = 0; // target/overwritten inode
7060
7061 // target inode
7062 if (!linkmerge) {
7063 if (destdnl->is_primary()) {
7064 assert(straydn); // moving to straydn.
7065 // link--, and move.
7066 if (destdn->is_auth()) {
7067 tpi = oldin->project_inode(); //project_snaprealm
7068 tpi->version = straydn->pre_dirty(tpi->version);
7069 tpi->update_backtrace();
7070 }
7071 straydn->push_projected_linkage(oldin);
7072 } else if (destdnl->is_remote()) {
7073 // nlink-- targeti
7074 if (oldin->is_auth()) {
7075 tpi = oldin->project_inode();
7076 tpi->version = oldin->pre_dirty();
7077 }
7078 }
7079 }
7080
7081 // dest
7082 if (srcdnl->is_remote()) {
7083 if (!linkmerge) {
7084 // destdn
7085 if (destdn->is_auth())
7086 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
7087 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7088 // srci
7089 if (srci->is_auth()) {
7090 pi = srci->project_inode();
7091 pi->version = srci->pre_dirty();
7092 }
7093 } else {
7094 dout(10) << " will merge remote onto primary link" << dendl;
7095 if (destdn->is_auth()) {
7096 pi = oldin->project_inode();
7097 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
7098 }
7099 }
7100 } else { // primary
7101 if (destdn->is_auth()) {
7102 version_t oldpv;
7103 if (srcdn->is_auth())
7104 oldpv = srci->get_projected_version();
7105 else {
7106 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
7107
7108 // note which dirfrags have child subtrees in the journal
7109 // event, so that we can open those (as bounds) during replay.
7110 if (srci->is_dir()) {
7111 list<CDir*> ls;
7112 srci->get_dirfrags(ls);
7113 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7114 CDir *dir = *p;
7115 if (!dir->is_auth())
7116 metablob->renamed_dir_frags.push_back(dir->get_frag());
7117 }
7118 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
7119 }
7120 }
7121 pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7122 // & srcdnl->snaprealm
7123 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
7124 pi->update_backtrace();
7125 }
7126 destdn->push_projected_linkage(srci);
7127 }
7128
7129 // src
7130 if (srcdn->is_auth())
7131 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
7132 srcdn->push_projected_linkage(); // push null linkage
7133
7134 if (!silent) {
7135 if (pi) {
7136 pi->ctime = mdr->get_op_stamp();
7137 pi->change_attr++;
7138 if (linkmerge)
7139 pi->nlink--;
7140 }
7141 if (tpi) {
7142 tpi->ctime = mdr->get_op_stamp();
7143 tpi->change_attr++;
7144 destdn->make_path_string(tpi->stray_prior_path, true);
7145 tpi->nlink--;
7146 if (tpi->nlink == 0)
7147 oldin->state_set(CInode::STATE_ORPHAN);
7148 }
7149 }
7150
7151 // prepare nesting, mtime updates
7152 int predirty_dir = silent ? 0:PREDIRTY_DIR;
7153
7154 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7155 // then link the source inode to destdn
7156 if (destdnl->is_primary()) {
7157 assert(straydn);
7158 if (straydn->is_auth()) {
7159 metablob->add_dir_context(straydn->get_dir());
7160 metablob->add_dir(straydn->get_dir(), true);
7161 }
7162 }
7163
7164 // sub off target
7165 if (destdn->is_auth() && !destdnl->is_null()) {
7166 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
7167 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
7168 if (destdnl->is_primary()) {
7169 assert(straydn);
7170 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
7171 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7172 }
7173 }
7174
7175 // move srcdn
7176 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
7177 int flags = predirty_dir | predirty_primary;
7178 if (srcdn->is_auth())
7179 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
7180 if (destdn->is_auth())
7181 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
7182
7183 SnapRealm *src_realm = srci->find_snaprealm();
7184 SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
7185 snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
7186
7187 // add it all to the metablob
7188 // target inode
7189 if (!linkmerge) {
7190 if (destdnl->is_primary()) {
7191 assert(straydn);
7192 if (destdn->is_auth()) {
7193 // project snaprealm, too
7194 if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
7195 oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
7196 straydn->first = MAX(oldin->first, next_dest_snap);
7197 metablob->add_primary_dentry(straydn, oldin, true, true);
7198 } else if (force_journal_stray) {
7199 dout(10) << " forced journaling straydn " << *straydn << dendl;
7200 metablob->add_dir_context(straydn->get_dir());
7201 metablob->add_primary_dentry(straydn, oldin, true);
7202 }
7203 } else if (destdnl->is_remote()) {
7204 if (oldin->is_auth()) {
7205 // auth for targeti
7206 metablob->add_dir_context(oldin->get_projected_parent_dir());
7207 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
7208 CEPH_NOSNAP, 0, destdnl);
7209 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
7210 }
7211 }
7212 }
7213
7214 // dest
7215 if (srcdnl->is_remote()) {
7216 if (!linkmerge) {
7217 if (destdn->is_auth() && !destdnl->is_null())
7218 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7219 else
7220 destdn->first = MAX(destdn->first, next_dest_snap);
7221
7222 if (destdn->is_auth())
7223 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7224 if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
7225 metablob->add_dir_context(srci->get_projected_parent_dir());
7226 mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
7227 metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
7228 }
7229 } else {
7230 if (destdn->is_auth() && !destdnl->is_null())
7231 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7232 else
7233 destdn->first = MAX(destdn->first, next_dest_snap);
7234
7235 if (destdn->is_auth())
7236 metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
7237 }
7238 } else if (srcdnl->is_primary()) {
7239 // project snap parent update?
7240 if (destdn->is_auth() && src_realm != dest_realm &&
7241 (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
7242 srci->project_past_snaprealm_parent(dest_realm);
7243
7244 if (destdn->is_auth() && !destdnl->is_null())
7245 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7246 else
7247 destdn->first = MAX(destdn->first, next_dest_snap);
7248
7249 if (destdn->is_auth())
7250 metablob->add_primary_dentry(destdn, srci, true, true);
7251 else if (force_journal_dest) {
7252 dout(10) << " forced journaling destdn " << *destdn << dendl;
7253 metablob->add_dir_context(destdn->get_dir());
7254 metablob->add_primary_dentry(destdn, srci, true);
7255 if (srcdn->is_auth() && srci->is_dir()) {
7256 // journal new subtrees root dirfrags
7257 list<CDir*> ls;
7258 srci->get_dirfrags(ls);
7259 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7260 CDir *dir = *p;
7261 if (dir->is_auth())
7262 metablob->add_dir(dir, true);
7263 }
7264 }
7265 }
7266 }
7267
7268 // src
7269 if (srcdn->is_auth()) {
7270 dout(10) << " journaling srcdn " << *srcdn << dendl;
7271 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
7272 // also journal the inode in case we need do slave rename rollback. It is Ok to add
7273 // both primary and NULL dentries. Because during journal replay, null dentry is
7274 // processed after primary dentry.
7275 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
7276 metablob->add_primary_dentry(srcdn, srci, true);
7277 metablob->add_null_dentry(srcdn, true);
7278 } else
7279 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
7280
7281 // make renamed inode first track the dn
7282 if (srcdnl->is_primary() && destdn->is_auth())
7283 srci->first = destdn->first;
7284
7285 if (oldin && oldin->is_dir()) {
7286 assert(straydn);
7287 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
7288 }
7289 if (srci->is_dir())
7290 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
7291
7292 }
7293
7294
7295 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7296 {
7297 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7298 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
7299
7300 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7301 CDentry::linkage_t *destdnl = destdn->get_linkage();
7302
7303 CInode *oldin = destdnl->get_inode();
7304
7305 // primary+remote link merge?
7306 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7307 (srcdnl->is_primary() || destdnl->is_primary()));
7308
7309 // target inode
7310 if (!linkmerge) {
7311 if (destdnl->is_primary()) {
7312 assert(straydn);
7313 dout(10) << "straydn is " << *straydn << dendl;
7314 destdn->get_dir()->unlink_inode(destdn, false);
7315
7316 straydn->pop_projected_linkage();
7317 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7318 assert(!straydn->is_projected()); // no other projected
7319
7320 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7321
7322 // nlink-- targeti
7323 if (destdn->is_auth()) {
7324 bool hadrealm = (oldin->snaprealm ? true : false);
7325 oldin->pop_and_dirty_projected_inode(mdr->ls);
7326 if (oldin->snaprealm && !hadrealm)
7327 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
7328 } else {
7329 // FIXME this snaprealm is not filled out correctly
7330 //oldin->open_snaprealm(); might be sufficient..
7331 }
7332 } else if (destdnl->is_remote()) {
7333 destdn->get_dir()->unlink_inode(destdn, false);
7334 if (oldin->is_auth())
7335 oldin->pop_and_dirty_projected_inode(mdr->ls);
7336 }
7337 }
7338
7339 // unlink src before we relink it at dest
7340 CInode *in = srcdnl->get_inode();
7341 assert(in);
7342
7343 bool srcdn_was_remote = srcdnl->is_remote();
7344 srcdn->get_dir()->unlink_inode(srcdn);
7345
7346 // dest
7347 if (srcdn_was_remote) {
7348 if (!linkmerge) {
7349 // destdn
7350 destdnl = destdn->pop_projected_linkage();
7351 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7352 assert(!destdn->is_projected()); // no other projected
7353
7354 destdn->link_remote(destdnl, in);
7355 if (destdn->is_auth())
7356 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
7357 // in
7358 if (in->is_auth())
7359 in->pop_and_dirty_projected_inode(mdr->ls);
7360 } else {
7361 dout(10) << "merging remote onto primary link" << dendl;
7362 oldin->pop_and_dirty_projected_inode(mdr->ls);
7363 }
7364 } else { // primary
7365 if (linkmerge) {
7366 dout(10) << "merging primary onto remote link" << dendl;
7367 destdn->get_dir()->unlink_inode(destdn, false);
7368 }
7369 destdnl = destdn->pop_projected_linkage();
7370 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7371 assert(!destdn->is_projected()); // no other projected
7372
7373 // srcdn inode import?
7374 if (!srcdn->is_auth() && destdn->is_auth()) {
7375 assert(mdr->more()->inode_import.length() > 0);
7376
7377 map<client_t,Capability::Import> imported_caps;
7378
7379 // finish cap imports
7380 finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
7381 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
7382 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
7383 mdr->more()->srcdn_auth_mds, true,
7384 mdr->more()->cap_imports[destdnl->get_inode()],
7385 imported_caps);
7386 }
7387
7388 mdr->more()->inode_import.clear();
7389 ::encode(imported_caps, mdr->more()->inode_import);
7390
7391 /* hack: add an auth pin for each xlock we hold. These were
7392 * remote xlocks previously but now they're local and
7393 * we're going to try and unpin when we xlock_finish. */
7394 for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
7395 i != mdr->xlocks.end();
7396 ++i)
7397 if ((*i)->get_parent() == destdnl->get_inode() &&
7398 !(*i)->is_locallock())
7399 mds->locker->xlock_import(*i);
7400
7401 // hack: fix auth bit
7402 in->state_set(CInode::STATE_AUTH);
7403
7404 mdr->clear_ambiguous_auth();
7405 }
7406
7407 if (destdn->is_auth()) {
7408 in->pop_and_dirty_projected_inode(mdr->ls);
7409
7410 } else {
7411 // FIXME: fix up snaprealm!
7412 }
7413 }
7414
7415 // src
7416 if (srcdn->is_auth())
7417 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
7418 srcdn->pop_projected_linkage();
7419 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7420 assert(!srcdn->is_projected()); // no other projected
7421
7422 // apply remaining projected inodes (nested)
7423 mdr->apply();
7424
7425 // update subtree map?
7426 if (destdnl->is_primary() && in->is_dir())
7427 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7428
7429 if (straydn && oldin->is_dir())
7430 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
7431
7432 // removing a new dn?
7433 if (srcdn->is_auth())
7434 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
7435 }
7436
7437
7438
7439 // ------------
7440 // SLAVE
7441
7442 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7443 CDentry *srcdn, *destdn, *straydn;
7444 public:
7445 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7446 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7447 void finish(int r) override {
7448 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7449 }
7450 };
7451
7452 class C_MDS_SlaveRenameCommit : public ServerContext {
7453 MDRequestRef mdr;
7454 CDentry *srcdn, *destdn, *straydn;
7455 public:
7456 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7457 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7458 void finish(int r) override {
7459 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7460 }
7461 };
7462
7463 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7464 MDRequestRef mdr;
7465 public:
7466 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7467 ServerContext(s), mdr(r) {}
7468 void finish(int r) override {
7469 server->_slave_rename_sessions_flushed(mdr);
7470 }
7471 };
7472
7473 /* This function DOES put the mdr->slave_request before returning*/
7474 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7475 {
7476 dout(10) << "handle_slave_rename_prep " << *mdr
7477 << " " << mdr->slave_request->srcdnpath
7478 << " to " << mdr->slave_request->destdnpath
7479 << dendl;
7480
7481 if (mdr->slave_request->is_interrupted()) {
7482 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7483 MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7484 reply->mark_interrupted();
7485 mds->send_message_mds(reply, mdr->slave_to_mds);
7486 mdr->slave_request->put();
7487 mdr->slave_request = 0;
7488 return;
7489 }
7490
7491 // discover destdn
7492 filepath destpath(mdr->slave_request->destdnpath);
7493 dout(10) << " dest " << destpath << dendl;
7494 vector<CDentry*> trace;
7495 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7496 if (r > 0) return;
7497 if (r == -ESTALE) {
7498 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7499 mdr->slave_to_mds);
7500 return;
7501 }
7502 assert(r == 0); // we shouldn't get an error here!
7503
7504 CDentry *destdn = trace[trace.size()-1];
7505 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7506 dout(10) << " destdn " << *destdn << dendl;
7507 mdr->pin(destdn);
7508
7509 // discover srcdn
7510 filepath srcpath(mdr->slave_request->srcdnpath);
7511 dout(10) << " src " << srcpath << dendl;
7512 CInode *srci = nullptr;
7513 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7514 if (r > 0) return;
7515 assert(r == 0);
7516
7517 // srcpath must not point to a null dentry
7518 assert(srci != nullptr);
7519
7520 CDentry *srcdn = trace[trace.size()-1];
7521 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7522 dout(10) << " srcdn " << *srcdn << dendl;
7523 mdr->pin(srcdn);
7524 mdr->pin(srci);
7525
7526 // stray?
7527 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7528 (srcdnl->is_primary() || destdnl->is_primary()));
7529 CDentry *straydn = mdr->straydn;
7530 if (destdnl->is_primary() && !linkmerge)
7531 assert(straydn);
7532
7533 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7534 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7535
7536 // set up commit waiter (early, to clean up any freezing etc we do)
7537 if (!mdr->more()->slave_commit)
7538 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7539
7540 // am i srcdn auth?
7541 if (srcdn->is_auth()) {
7542 set<mds_rank_t> srcdnrep;
7543 srcdn->list_replicas(srcdnrep);
7544
7545 bool reply_witness = false;
7546 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7547 // freeze?
7548 // we need this to
7549 // - avoid conflicting lock state changes
7550 // - avoid concurrent updates to the inode
7551 // (this could also be accomplished with the versionlock)
7552 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7553 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7554 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7555 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7556
7557 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7558 if (srcdnl->get_inode()->is_frozen_auth_pin())
7559 mdr->unfreeze_auth_pin();
7560
7561 if (!frozen_inode) {
7562 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7563 return;
7564 }
7565
7566 /*
7567 * set ambiguous auth for srci
7568 * NOTE: we don't worry about ambiguous cache expire as we do
7569 * with subtree migrations because all slaves will pin
7570 * srcdn->get_inode() for duration of this rename.
7571 */
7572 mdr->set_ambiguous_auth(srcdnl->get_inode());
7573
7574 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7575 // the master will send another OP_RENAMEPREP slave request later.
7576 if (mdr->slave_request->witnesses.size() > 1) {
7577 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7578 reply_witness = true;
7579 }
7580
7581 // make sure bystanders have received all lock related messages
7582 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7583 if (*p == mdr->slave_to_mds ||
7584 (mds->is_cluster_degraded() &&
7585 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7586 continue;
7587 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7588 MMDSSlaveRequest::OP_RENAMENOTIFY);
7589 mds->send_message_mds(notify, *p);
7590 mdr->more()->waiting_on_slave.insert(*p);
7591 }
7592
7593 // make sure clients have received all cap related messages
7594 set<client_t> export_client_set;
7595 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7596
7597 MDSGatherBuilder gather(g_ceph_context);
7598 flush_client_sessions(export_client_set, gather);
7599 if (gather.has_subs()) {
7600 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7601 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7602 gather.activate();
7603 }
7604 }
7605
7606 // is witness list sufficient?
7607 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7608 if (*p == mdr->slave_to_mds ||
7609 mdr->slave_request->witnesses.count(*p)) continue;
7610 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7611 reply_witness = true;
7612 break;
7613 }
7614
7615 if (reply_witness) {
7616 assert(!srcdnrep.empty());
7617 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7618 MMDSSlaveRequest::OP_RENAMEPREPACK);
7619 reply->witnesses.swap(srcdnrep);
7620 mds->send_message_mds(reply, mdr->slave_to_mds);
7621 mdr->slave_request->put();
7622 mdr->slave_request = 0;
7623 return;
7624 }
7625 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7626 if (!mdr->more()->waiting_on_slave.empty()) {
7627 dout(10) << " still waiting for rename notify acks from "
7628 << mdr->more()->waiting_on_slave << dendl;
7629 return;
7630 }
7631 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7632 // set ambiguous auth for srci on witnesses
7633 mdr->set_ambiguous_auth(srcdnl->get_inode());
7634 }
7635
7636 // encode everything we'd need to roll this back... basically, just the original state.
7637 rename_rollback rollback;
7638
7639 rollback.reqid = mdr->reqid;
7640
7641 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7642 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7643 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7644 rollback.orig_src.dname = srcdn->name;
7645 if (srcdnl->is_primary())
7646 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7647 else {
7648 assert(srcdnl->is_remote());
7649 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7650 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7651 }
7652
7653 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7654 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7655 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7656 rollback.orig_dest.dname = destdn->name;
7657 if (destdnl->is_primary())
7658 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7659 else if (destdnl->is_remote()) {
7660 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7661 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7662 }
7663
7664 if (straydn) {
7665 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7666 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7667 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7668 rollback.stray.dname = straydn->name;
7669 }
7670 ::encode(rollback, mdr->more()->rollback_bl);
7671 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7672
7673 // journal.
7674 mdr->ls = mdlog->get_current_segment();
7675 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7676 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7677 mdlog->start_entry(le);
7678 le->rollback = mdr->more()->rollback_bl;
7679
7680 bufferlist blah; // inode import data... obviously not used if we're the slave
7681 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7682
7683 if (le->commit.empty()) {
7684 dout(10) << " empty metablob, skipping journal" << dendl;
7685 mdlog->cancel_entry(le);
7686 mdr->ls = NULL;
7687 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7688 } else {
7689 mdr->more()->slave_update_journaled = true;
7690 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7691 mdr, __func__);
7692 mdlog->flush();
7693 }
7694 }
7695
/*
 * Journal completion for a slave rename prep (also called directly when the
 * prep produced an empty metablob and was not journaled).  Applies the
 * rename locally, exports the source inode to the master when we are the
 * srcdn auth, and sends the OP_RENAMEPREPACK reply -- unless mdr->aborted
 * is set, in which case no reply is sent and the request is finished.
 *
 * srcdn/destdn: dentries discovered during prep.
 * straydn: may be NULL when no stray dentry is involved.
 */
void Server::_logged_slave_rename(MDRequestRef& mdr,
				  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rename " << *mdr << dendl;

  // prepare ack
  MMDSSlaveRequest *reply = NULL;
  if (!mdr->aborted) {
    reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->slave_update_journaled)
      // master tracks has_journaled_slaves from this flag
      // (see handle_slave_rename_prep_ack)
      reply->mark_not_journaled();
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = NULL;
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  // only the srcdn auth with a primary linkage actually owns the inode
  // and needs to hand it to the master.
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    list<CDir*> bounds;
    if (srcdnl->get_inode()->is_dir()) {
      srcdnl->get_inode()->get_dirfrags(bounds);
      for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
	(*p)->state_set(CDir::STATE_EXPORTBOUND);
    }

    map<client_t,entity_inst_t> exported_client_map;
    bufferlist inodebl;
    mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
					   exported_client_map);

    // bounds were only needed for encode_export_inode(); clear them again
    for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
      (*p)->state_clear(CDir::STATE_EXPORTBOUND);

    if (reply) {
      // client map first, then the inode blob, so the master can decode both
      ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->inode.version;
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    // we no longer own the inode; drop any local dirty state
    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  // re-read linkage: _rename_apply() changed destdn's linkage
  destdnl = destdn->get_linkage();

  // bump popularity
  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
			     META_POP_IWR);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;
  mdr->straydn = 0;

  if (reply) {
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    // abort path: no ack was built above, just tear the request down
    assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
7771
7772 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7773 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7774 {
7775 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7776
7777 CDentry::linkage_t *destdnl = destdn->get_linkage();
7778
7779 list<MDSInternalContextBase*> finished;
7780 if (r == 0) {
7781 // unfreeze+singleauth inode
7782 // hmm, do i really need to delay this?
7783 if (mdr->more()->is_inode_exporter) {
7784
7785 CInode *in = destdnl->get_inode();
7786
7787 // drop our pins
7788 // we exported, clear out any xlocks that we moved to another MDS
7789 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7790 while (i != mdr->xlocks.end()) {
7791 SimpleLock *lock = *i++;
7792
7793 // we only care about xlocks on the exported inode
7794 if (lock->get_parent() == in &&
7795 !lock->is_locallock())
7796 mds->locker->xlock_export(lock, mdr.get());
7797 }
7798
7799 map<client_t,Capability::Import> peer_imported;
7800 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7801 ::decode(peer_imported, bp);
7802
7803 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7804 mdcache->migrator->finish_export_inode(destdnl->get_inode(),
7805 mdr->get_mds_stamp(),
7806 mdr->slave_to_mds, peer_imported, finished);
7807 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
7808
7809 // unfreeze
7810 assert(destdnl->get_inode()->is_frozen_inode());
7811 destdnl->get_inode()->unfreeze_inode(finished);
7812 }
7813
7814 // singleauth
7815 if (mdr->more()->is_ambiguous_auth) {
7816 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7817 mdr->more()->is_ambiguous_auth = false;
7818 }
7819
7820 if (straydn && mdr->more()->slave_update_journaled) {
7821 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7822 if (strayin && !strayin->snaprealm)
7823 mdcache->clear_dirty_bits_for_stray(strayin);
7824 }
7825
7826 mds->queue_waiters(finished);
7827 mdr->cleanup();
7828
7829 if (mdr->more()->slave_update_journaled) {
7830 // write a commit to the journal
7831 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7832 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7833 ESlaveUpdate::RENAME);
7834 mdlog->start_entry(le);
7835 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7836 mdlog->flush();
7837 } else {
7838 _committed_slave(mdr);
7839 }
7840 } else {
7841
7842 // abort
7843 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7844 // witness list from the master, and they failed before we tried prep again.
7845 if (mdr->more()->rollback_bl.length()) {
7846 if (mdr->more()->is_inode_exporter) {
7847 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7848 destdnl->get_inode()->abort_export();
7849 }
7850 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7851 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7852 // rollback but preserve the slave request
7853 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7854 mdr->more()->rollback_bl.clear();
7855 } else
7856 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7857 } else {
7858 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
7859 // singleauth
7860 if (mdr->more()->is_ambiguous_auth) {
7861 if (srcdn->is_auth())
7862 mdr->more()->rename_inode->unfreeze_inode(finished);
7863
7864 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7865 mdr->more()->is_ambiguous_auth = false;
7866 }
7867 mds->queue_waiters(finished);
7868 mdcache->request_finish(mdr);
7869 }
7870 }
7871 }
7872
7873 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7874 bool isdir, int linkunlink, nest_info_t &rstat)
7875 {
7876 fnode_t *pf;
7877 pf = dir->project_fnode();
7878 mut->add_projected_fnode(dir);
7879 pf->version = dir->pre_dirty();
7880
7881 if (isdir) {
7882 pf->fragstat.nsubdirs += linkunlink;
7883 } else {
7884 pf->fragstat.nfiles += linkunlink;
7885 }
7886 if (r.ino) {
7887 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7888 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7889 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7890 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7891 }
7892 if (pf->fragstat.mtime == ctime) {
7893 pf->fragstat.mtime = r.dirfrag_old_mtime;
7894 if (pf->rstat.rctime == ctime)
7895 pf->rstat.rctime = r.dirfrag_old_rctime;
7896 }
7897 mut->add_updated_lock(&dir->get_inode()->filelock);
7898 mut->add_updated_lock(&dir->get_inode()->nestlock);
7899 }
7900
/*
 * Journal completion context for do_rename_rollback(): once the
 * ESlaveUpdate::OP_ROLLBACK entry is durable, hand everything captured
 * here back to Server::_rename_rollback_finish().
 */
struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;       // the projected rollback mutation to apply
  CDentry *srcdn;        // may be NULL (not found in cache)
  version_t srcdnpv;     // pre-dirtied version for srcdn (0 if not ours)
  CDentry *destdn;       // may be NULL
  CDentry *straydn;      // may be NULL
  bool finish_mdr;       // whether to finish the MDRequest afterwards
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
			     CDentry *sd, version_t pv, CDentry *dd,
			     CDentry *st, bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {}
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
				    destdn, straydn, finish_mdr);
  }
};
7918
/*
 * Roll back the slave side of a rename whose master (rank 'master')
 * failed.  'rbl' is the rename_rollback record encoded during slave
 * rename prep; it describes the original src/dest/stray dentry state,
 * which we re-project here and journal as ESlaveUpdate::OP_ROLLBACK.
 *
 * 'mdr' may be NULL (every use below is guarded); 'finish_mdr' is passed
 * through to _rename_rollback_finish() to decide whether the request is
 * finished once the rollback is durable.
 *
 * Any of srcdn/destdn/straydn (and the inodes) may be absent from our
 * cache; each lookup below tolerates that and the later repair/journal
 * steps are guarded accordingly.
 */
void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
				bool finish_mdr)
{
  rename_rollback rollback;
  bufferlist::iterator p = rbl.begin();
  ::decode(rollback, p);

  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, master);

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  // locate srcdir/srcdn; fall back to a name-based dirfrag lookup if the
  // exact frag recorded in the rollback is not in cache
  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  if (!srcdir)
    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  if (srcdir) {
    dout(10) << " srcdir " << *srcdir << dendl;
    srcdn = srcdir->lookup(rollback.orig_src.dname);
    if (srcdn) {
      dout(10) << " srcdn " << *srcdn << dendl;
      // the rename moved the source away, so the dentry must be null now
      assert(srcdn->get_linkage()->is_null());
    } else
      dout(10) << " srcdn not found" << dendl;
  } else
    dout(10) << " srcdir not found" << dendl;

  // locate destdir/destdn the same way
  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  if (!destdir)
    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  if (destdir) {
    dout(10) << " destdir " << *destdir << dendl;
    destdn = destdir->lookup(rollback.orig_dest.dname);
    if (destdn)
      dout(10) << " destdn " << *destdn << dendl;
    else
      dout(10) << " destdn not found" << dendl;
  } else
    dout(10) << " destdir not found" << dendl;

  // the renamed inode: primary (orig_src.ino) or remote (orig_src.remote_ino)
  CInode *in = NULL;
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      assert(srcdn && destdn);
  } else
    in = mdcache->get_inode(rollback.orig_src.remote_ino);

  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    if (straydir) {
      dout(10) << "straydir " << *straydir << dendl;
      straydn = straydir->lookup(rollback.stray.dname);
      if (straydn) {
	dout(10) << " straydn " << *straydn << dendl;
	assert(straydn->get_linkage()->is_primary());
      } else
	dout(10) << " straydn not found" << dendl;
    } else
      dout(10) << "straydir not found" << dendl;
  }

  // the inode the rename overwrote at dest, if any
  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    if (target)
      assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);

  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  // slave
  assert(!destdn || destdn->authority().first != whoami);
  assert(!straydn || straydn->authority().first != whoami);

  // decide whether we must journal src/dest state we are not auth for
  // (e.g. because a renamed dir's subtree touches us)
  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);

  version_t srcdnpv = 0;
  // repair src
  if (srcdn) {
    if (srcdn->authority().first == whoami)
      srcdnpv = srcdn->pre_dirty();
    if (rollback.orig_src.ino) {
      assert(in);
      srcdn->push_projected_linkage(in);
    } else
      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
				    rollback.orig_src.remote_d_type);
  }

  // restore the renamed inode's ctime (project only if we are its auth)
  inode_t *pi = 0;
  if (in) {
    if (in->authority().first == whoami) {
      pi = in->project_inode();
      mut->add_projected_inode(in);
      pi->version = in->pre_dirty();
    } else
      pi = in->get_projected_inode();
    if (pi->ctime == rollback.ctime)
      pi->ctime = rollback.orig_src.old_ctime;
  }

  // re-link into srcdir => fragstat/rstat get a +1 delta
  if (srcdn && srcdn->authority().first == whoami) {
    nest_info_t blah;
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
			 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
  }

  // repair dest
  if (destdn) {
    if (rollback.orig_dest.ino && target) {
      destdn->push_projected_linkage(target);
    } else if (rollback.orig_dest.remote_ino) {
      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
				     rollback.orig_dest.remote_d_type);
    } else {
      // the dentry will be trimmed soon, it's ok to have wrong linkage
      if (rollback.orig_dest.ino)
	assert(mds->is_resolve());
      destdn->push_projected_linkage();
    }
  }

  // the stray dentry goes back to null: the target returns to destdn
  if (straydn)
    straydn->push_projected_linkage();

  if (target) {
    inode_t *ti = NULL;
    if (target->authority().first == whoami) {
      ti = target->project_inode();
      mut->add_projected_inode(target);
      ti->version = target->pre_dirty();
    } else
      ti = target->get_projected_inode();
    if (ti->ctime == rollback.ctime)
      ti->ctime = rollback.orig_dest.old_ctime;
    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
      // stray-sourced renames are reintegration/migration; sanity-check
      // the recorded linkage instead of touching nlink
      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
	assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
      else
	assert(rollback.orig_dest.remote_ino &&
	       rollback.orig_dest.remote_ino == rollback.orig_src.ino);
    } else
      ti->nlink++;  // the rename had unlinked the target; restore its link
  }

  if (srcdn)
    dout(0) << " srcdn back to " << *srcdn << dendl;
  if (in)
    dout(0) << " srci back to " << *in << dendl;
  if (destdn)
    dout(0) << " destdn back to " << *destdn << dendl;
  if (target)
    dout(0) << " desti back to " << *target << dendl;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);

  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    else
      le->commit.add_remote_dentry(srcdn, true);
  }

  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    // journal the inode under its (unchanged) primary parent
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  }

  if (force_journal_dest) {
    assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  }

  // slave: no need to journal straydn

  if (target && target != in && target->authority().first == whoami) {
    assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  }

  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      // record which non-auth frags of the renamed dir are open here
      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	CDir *dir = *p;
	if (!dir->is_auth())
	  le->commit.renamed_dir_frags.push_back(dir->get_frag());
      }
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
    }
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  }

  // moving directories back requires subtree map adjustments
  if (target && target->is_dir()) {
    assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  }

  if (in && in->is_dir()) {
    assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  }

  if (mdr && !mdr->more()->slave_update_journaled) {
    // nothing was journaled during prep, so nothing to journal now either
    assert(le->commit.empty());
    mdlog->cancel_entry(le);
    mut->ls = NULL;
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
  } else {
    assert(!le->commit.empty());
    if (mdr)
      mdr->more()->slave_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
							    destdn, straydn, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
    mdlog->flush();
  }
}
8161
/*
 * Second half of do_rename_rollback(): runs after the rollback journal
 * entry is durable (or immediately when nothing was journaled).  Pops the
 * projected linkages into place, applies the mutation, fixes up auth
 * state and subtree maps, and finishes/cleans up the request.
 *
 * srcdn/destdn/straydn may each be NULL (not found in cache during
 * do_rename_rollback); 'mdr' may be NULL as well.
 */
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
				     version_t srcdnpv, CDentry *destdn,
				     CDentry *straydn, bool finish_mdr)
{
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;

  // pop the projected linkages: stray and dest are unlinked first so the
  // inodes can be re-linked where the rollback projected them
  if (straydn) {
    straydn->get_dir()->unlink_inode(straydn);
    straydn->pop_projected_linkage();
  }
  if (destdn) {
    destdn->get_dir()->unlink_inode(destdn);
    destdn->pop_projected_linkage();
  }
  if (srcdn) {
    srcdn->pop_projected_linkage();
    if (srcdn->authority().first == mds->get_nodeid())
      srcdn->mark_dirty(srcdnpv, mut->ls);
  }

  mut->apply();

  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    // the inode came back to us; restore our auth bit
    if (srcdn->authority().first == mds->get_nodeid())
      in->state_set(CInode::STATE_AUTH);
    // update subtree map?
    if (in && in->is_dir()) {
      assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
    }
  }

  if (destdn) {
    CInode *oldin = destdn->get_linkage()->get_inode();
    // update subtree map?
    if (oldin && oldin->is_dir()) {
      assert(straydn);
      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
    }
  }

  if (mds->is_resolve()) {
    // during resolve, a rolled-back non-auth subtree may now be trimmable
    CDir *root = NULL;
    if (straydn)
      root = mdcache->get_subtree_root(straydn->get_dir());
    else if (destdn)
      root = mdcache->get_subtree_root(destdn->get_dir());
    if (root)
      mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr) {
    list<MDSInternalContextBase*> finished;
    if (mdr->more()->is_ambiguous_auth) {
      // NOTE(review): srcdn is dereferenced unguarded here; presumably
      // is_ambiguous_auth implies srcdn was found -- confirm.
      if (srcdn->is_auth())
	mdr->more()->rename_inode->unfreeze_inode(finished);

      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }
    mds->queue_waiters(finished);
    if (finish_mdr || mdr->aborted)
      mdcache->request_finish(mdr);
    else
      mdr->more()->slave_rolling_back = false;
  }

  mdcache->finish_rollback(mut->reqid);

  mut->cleanup();
}
8234
/* This function DOES put the passed message before returning*/
/*
 * Master-side handler for a slave's OP_RENAMEPREPACK.  Three outcomes:
 *  - interrupted ack: ignore;
 *  - empty witness list: the slave is now a witness (and possibly journaled);
 *  - non-empty witness list: the slave wants us to recruit these extra
 *    witnesses first; stash them and retry.
 * Also claims any exported srci blob, then re-dispatches the request once
 * no more slaves are being waited on.
 */
void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_rename_prep_ack " << *mdr
	   << " witnessed by " << ack->get_source()
	   << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // note slave
  mdr->more()->slaves.insert(from);
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    // srcdn auth froze the inode remotely; mirror its ambiguous-auth state
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  }

  // witnessed? or add extra witnesses?
  assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " slave request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_slaves = true;
  } else {
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses.swap(ack->witnesses);
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.claim(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr); // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
8281
8282 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8283 {
8284 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8285 << ack->get_source() << dendl;
8286 assert(mdr->is_slave());
8287 mds_rank_t from = mds_rank_t(ack->get_source().num());
8288
8289 if (mdr->more()->waiting_on_slave.count(from)) {
8290 mdr->more()->waiting_on_slave.erase(from);
8291
8292 if (mdr->more()->waiting_on_slave.empty()) {
8293 if (mdr->slave_request)
8294 dispatch_slave_request(mdr);
8295 } else
8296 dout(10) << " still waiting for rename notify acks from "
8297 << mdr->more()->waiting_on_slave << dendl;
8298 }
8299 }
8300
8301 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8302 {
8303 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8304
8305 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8306 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8307
8308 if (mdr->more()->waiting_on_slave.empty()) {
8309 if (mdr->slave_request)
8310 dispatch_slave_request(mdr);
8311 } else
8312 dout(10) << " still waiting for rename notify acks from "
8313 << mdr->more()->waiting_on_slave << dendl;
8314 }
8315 }
8316
8317 // snaps
/* This function takes responsibility for the passed mdr*/
/*
 * List the snapshots visible on a directory (readdir-style reply).
 * Forwards to the auth MDS if needed, takes snap rdlocks, then encodes
 * up to max_entries snapshot names (bounded by max_bytes) starting after
 * the snapid resolved from path2 (the readdir offset key).
 */
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  // traverse to path
  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_auth()) {
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  dout(10) << "lssnap on " << *diri << dendl;

  // lock snap
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t,SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());

  unsigned max_entries = req->head.args.readdir.max_entries;
  if (!max_entries)
    max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;

  // path2 carries the readdir offset: the name of the last snap returned
  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());

  bufferlist dirbl;
  encode_empty_dirstat(dirbl);

  // reserve room for the trailing num (u32) and flags fields
  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;

  __u32 num = 0;
  bufferlist dnbl;
  map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;

    // actual
    // snaps created on this dir use the short name; inherited ones the long
    string snap_name;
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
    else
      snap_name = p->second->get_long_name();

    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
      break;

    ::encode(snap_name, dnbl);
    encode_infinite_lease(dnbl);

    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
    if (r < 0) {
      // entry didn't fit: truncate back to the previous entry boundary
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    ++num;
  }

  ::encode(num, dirbl);
  __u16 flags = 0;
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      // no offset was given and we emitted everything: listing is complete
      flags |= CEPH_READDIR_FRAG_COMPLETE;
  }
  ::encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
8414
8415
8416 // MKSNAP
8417
/*
 * Journal completion context for mksnap: once the snapshot update is
 * durable, hand off to Server::_mksnap_finish() with the directory inode
 * and the new snap's info (copied by value, so it outlives the caller).
 */
struct C_MDS_mksnap_finish : public ServerLogContext {
  CInode *diri;   // directory being snapshotted
  SnapInfo info;  // info of the snapshot being created
  C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
    ServerLogContext(s, r), diri(di), info(i) {}
  void finish(int r) override {
    server->_mksnap_finish(mdr, diri, info);
  }
};
8427
/* This function takes responsibility for the passed mdr */
/*
 * Handle a client CEPH_MDS_OP_MKSNAP request: create a snapshot named by the
 * last dentry of the request filepath on the directory inode named by the
 * filepath's ino.
 *
 * Flow (the function is re-entered via C_MDS_RetryRequest after the snaptable
 * prepare completes, distinguished by mdr->more()->stid being set):
 *   1. validate: snaps enabled, inode present/auth/dir, uid allowed
 *   2. xlock the dir's snaplock, check MAY_WRITE access
 *   3. first pass: ask the snapclient to prepare a snapid+stid, then return
 *   4. second pass: project inode + snaprealm, journal an EUpdate, and
 *      finish in _mksnap_finish once it commits
 */
void Server::handle_client_mksnap(MDRequestRef& mdr)
{
  // snapshots must be explicitly enabled in the MDSMap
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    respond_to_request(mdr, -EPERM);
    return;
  }

  MClientRequest *req = mdr->client_request;
  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }

  // dir only
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    respond_to_request(mdr, -EPERM);
    return;
  }

  const string &snapname = req->get_filepath().last_dentry();

  // only uids within [mds_snap_min_uid, mds_snap_max_uid] may manage snapshots
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;

  // lock snap
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  mds->locker->include_snap_rdlocks(rdlocks, diri);
  // we modify this dir's snaprealm, so take its snaplock exclusively
  // instead of the rdlock include_snap_rdlocks added
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }
  // names beginning with '_' are reserved (long snap names of parent realms)
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // allocate a snapid
  if (!mdr->more()->stid) {
    // prepare an stid; re-entered via C_MDS_RetryRequest when it is ready
    mds->snapclient->prepare_create(diri->ino(), snapname,
                                    mdr->get_mds_stamp(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  // second pass: the snaptable prepared stid + snapid for us
  version_t stid = mdr->more()->stid;
  snapid_t snapid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  ::decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  // journal
  SnapInfo info;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();

  inode_t *pi = diri->project_inode();
  pi->ctime = info.stamp;
  pi->version = diri->pre_dirty();

  // project the snaprealm: record the new snap and bump seq/last_created
  sr_t *newsnap = diri->project_snaprealm(snapid);
  newsnap->snaps[snapid] = info;
  newsnap->seq = snapid;
  newsnap->last_created = snapid;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
                     mdr, __func__);
  mdlog->flush();
}
8543
/*
 * Journal-commit callback for mksnap: apply the projected inode and
 * snaprealm, commit the snaptable transaction, notify clients of the
 * realm change, and reply to the client.
 */
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  // checked before apply: if the dir already had a snaprealm this is a
  // plain CREATE; otherwise applying the projection will split a new
  // realm off the parent, so notify clients with SPLIT
  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  // create snap
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay — reply with a trace to the dir at the new snapid
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
8566
8567
8568 // RMSNAP
8569
8570 struct C_MDS_rmsnap_finish : public ServerLogContext {
8571 CInode *diri;
8572 snapid_t snapid;
8573 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8574 ServerLogContext(s, r), diri(di), snapid(sn) {}
8575 void finish(int r) override {
8576 server->_rmsnap_finish(mdr, diri, snapid);
8577 }
8578 };
8579
/* This function takes responsibility for the passed mdr */
/*
 * Handle a client CEPH_MDS_OP_RMSNAP request: remove the snapshot named by
 * the last dentry of the request filepath from the directory inode named by
 * the filepath's ino.
 *
 * Like mksnap, this is a two-pass handler: the first pass asks the
 * snapclient to prepare the destroy (setting mdr->more()->stid) and
 * returns; the retry journals the change and finishes in _rmsnap_finish.
 */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  const string &snapname = req->get_filepath().last_dentry();

  // only uids within [mds_snap_min_uid, mds_snap_max_uid] may manage snapshots
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  // NOTE(review): the snaprealm is consulted here before the snaplock is
  // acquired below, whereas handle_client_mksnap checks existence only
  // after locking — confirm this ordering is intentional.
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  // we modify this dir's snaprealm, so take its snaplock exclusively
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // prepare; re-entered via C_MDS_RetryRequest when the snaptable answers
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // journal
  inode_t *pi = diri->project_inode();
  pi->version = diri->pre_dirty();
  pi->ctime = mdr->get_op_stamp();

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm: drop the snap and bump seq/last_destroyed
  sr_t *newnode = diri->project_snaprealm();
  newnode->snaps.erase(snapid);
  newnode->seq = seq;
  newnode->last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
8669
8670 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8671 {
8672 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
8673 snapid_t stid = mdr->more()->stid;
8674 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8675 snapid_t seq;
8676 ::decode(seq, p);
8677
8678 diri->pop_and_dirty_projected_inode(mdr->ls);
8679 mdr->apply();
8680
8681 mds->snapclient->commit(stid, mdr->ls);
8682
8683 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8684
8685 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8686
8687 // yay
8688 mdr->in[0] = diri;
8689 respond_to_request(mdr, 0);
8690
8691 // purge snapshot data
8692 if (diri->snaprealm->have_past_parents_open())
8693 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8694 }
8695
8696 struct C_MDS_renamesnap_finish : public ServerLogContext {
8697 CInode *diri;
8698 snapid_t snapid;
8699 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8700 ServerLogContext(s, r), diri(di), snapid(sn) {}
8701 void finish(int r) override {
8702 server->_renamesnap_finish(mdr, diri, snapid);
8703 }
8704 };
8705
/* This function takes responsibility for the passed mdr */
/*
 * Handle a client CEPH_MDS_OP_RENAMESNAP request: rename the snapshot named
 * by filepath2's last dentry (source) to filepath's last dentry (dest) on
 * the directory inode both filepaths must refer to.
 *
 * Two-pass handler like mksnap/rmsnap: the first pass asks the snapclient
 * to prepare the update (setting mdr->more()->stid) and returns; the retry
 * journals the change and finishes in _renamesnap_finish.
 */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  // both paths must name the same directory inode
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // only uids within [mds_snap_min_uid, mds_snap_max_uid] may manage snapshots
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  const string &dstname = req->get_filepath().last_dentry();
  const string &srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  // names beginning with '_' are reserved (parent-realm snaps)
  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  // NOTE(review): the snaprealm is consulted here before the snaplock is
  // acquired below, as in handle_client_rmsnap — confirm intentional.
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  // we modify this dir's snaprealm, so take its snaplock exclusively
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // prepare; re-entered via C_MDS_RetryRequest when the snaptable answers
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // journal
  inode_t *pi = diri->project_inode();
  pi->ctime = mdr->get_op_stamp();
  pi->version = diri->pre_dirty();

  // project the snaprealm: rename the snap in place
  sr_t *newsnap = diri->project_snaprealm();
  assert(newsnap->snaps.count(snapid));
  newsnap->snaps[snapid].name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
8813
/*
 * Journal-commit callback for renamesnap: apply the projected inode and
 * snaprealm, commit the snaptable transaction, notify clients of the
 * realm update, and reply with a trace to the dir at the renamed snapid.
 */
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // UPDATE (not CREATE/DESTROY); the extra flag differs from the other
  // snap-op notifications in this file
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}
8833
8834 /**
8835 * Return true if server is in state RECONNECT and this
8836 * client has not yet reconnected.
8837 */
8838 bool Server::waiting_for_reconnect(client_t c) const
8839 {
8840 return client_reconnect_gather.count(c) > 0;
8841 }
8842
/*
 * Dump the set of clients we are still waiting on to reconnect into
 * the given formatter, as a "reconnect_status" object section.
 */
void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}