// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "messages/MClientSession.h"
#include "messages/MClientRequest.h"
#include "messages/MClientReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MLock.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <iostream>
using namespace std;

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

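/*
 * Completion contexts bound to this Server.  ServerContext is the generic
 * form; ServerLogContext is for journal (MDLog) completions and, when given
 * an MDRequestRef, stamps the request's event trace once the entry commits.
 * Both expose the owning MDSRank via get_mds() for the common callback code.
 */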
class ServerContext : public MDSInternalContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr");
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr");
  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
                      "Client session messages", "hcs");
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_slave_request", "Slave requests dispatched");
  plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
                      "Request type lookup hash of inode");
  plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
                      "Request type lookup inode");
  plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
                      "Request type lookup parent");
  plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
                      "Request type lookup name");
  plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
                      "Request type lookup");
  plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
                      "Request type lookup snapshot");
  plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
                      "Request type get attribute");
  plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
                      "Request type set attribute");
  plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
                      "Request type set file layout");
  plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
                      "Request type set directory layout");
  plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
                      "Request type set extended attribute");
  plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
                      "Request type remove extended attribute");
  plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
                      "Request type read directory");
  plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
                      "Request type set file lock");
  plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
                      "Request type get file lock");
  plb.add_u64_counter(l_mdss_req_create, "req_create",
                      "Request type create");
  plb.add_u64_counter(l_mdss_req_open, "req_open",
                      "Request type open");
  plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
                      "Request type make node");
  plb.add_u64_counter(l_mdss_req_link, "req_link",
                      "Request type link");
  plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
                      "Request type unlink");
  plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
                      "Request type remove directory");
  plb.add_u64_counter(l_mdss_req_rename, "req_rename",
                      "Request type rename");
  plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
                      "Request type make directory");
  plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
                      "Request type symbolic link");
  plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
                      "Request type list snapshot");
  plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
                      "Request type make snapshot");
  plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
                      "Request type remove snapshot");
  plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
                      "Request type rename snapshot");
  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false)
{
}

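/*
 * Top-level entry for messages handed to the Server: reconnects are taken
 * in any state; everything else is gated until the rank is active, with
 * two exceptions handled below (replayed/completed client requests while
 * reconnecting, and session opens plus queued-for-replay requests during
 * clientreplay).
 */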
/* This function DOES put the passed message before returning */
void Server::dispatch(Message *m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(static_cast<MClientReconnect*>(m));
    return;
  }

  // active?
  if (!mds->is_active() &&
      !(mds->is_stopping() && m->get_source().is_mds())) {
    if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
        (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
      MClientRequest *req = static_cast<MClientRequest*>(m);
      Session *session = get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        req->put();
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
      } else if (req->get_retry_attempt()) {
        // process completed requests in the clientreplay stage. The completed
        // request may have created a new file/directory; this guarantees the
        // MDS sends a reply to the client before another request modifies it.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
      // handle_slave_request() will wait if necessary
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      // session open requests need to be handled during replay,
      // close requests need to be delayed
      if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
           (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
        wait_for_active = false;
      } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
        MClientRequest *req = static_cast<MClientRequest*>(m);
        if (req->is_queued_for_replay()) {
          wait_for_active = false;
        }
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(static_cast<MClientRequest*>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    assert(0 == "server unknown message");
  }
}


// ----------------------------------------------------------
// SESSION management

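/*
 * Journal completion for session open/close: once the ESession entry is
 * safely logged, apply the projected SessionMap (and InoTable, when inodes
 * were released) changes via Server::_session_logged(), then run any
 * caller-supplied continuation.
 */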
class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

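/*
 * Look up the Session attached to a message's Connection.  The session is
 * stored as the connection's priv data; the ref taken by get_priv() is
 * dropped immediately since the caller does not keep one.
 */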
Session *Server::get_session(Message *m)
{
  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
  if (session) {
    dout(20) << "get_session have " << session << " " << session->info.inst
             << " state " << session->get_state_name() << dendl;
    session->put();  // do not carry the ref
  } else {
    dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
  }
  return session;
}

/* This function DOES put the passed message before returning */
void Server::handle_client_session(MClientSession *m)
{
  version_t pv;
  bool blacklisted = false;
  Session *session = get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    m->put();
    return;
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing()) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      m->put();
      return;
    }
    assert(session->is_closed() ||
           session->is_closing());

    blacklisted = mds->objecter->with_osdmap(
        [session](const OSDMap &osd_map) -> bool {
          return osd_map.is_blacklisted(session->info.inst.addr);
        });

    if (blacklisted) {
      dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
      m->put();
      return;
    }

    session->set_client_metadata(m->client_meta);
    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
             << session->info.client_metadata.size() << " metadata entries:" << dendl;
    for (map<string, string>::iterator i = session->info.client_metadata.begin();
         i != session->info.client_metadata.end(); ++i) {
      dout(20) << "  " << i->first << ": " << i->second << dendl;
    }

    // Special case for the 'root' metadata path; validate that the claimed
    // root is actually within the caps of the session
    if (session->info.client_metadata.count("root")) {
      const auto claimed_root = session->info.client_metadata.at("root");
      // claimed_root has a leading "/" which we strip before passing
      // into the caps check
      if (claimed_root.empty() || claimed_root[0] != '/' ||
          !session->auth_caps.path_capable(claimed_root.substr(1))) {
        derr << __func__ << " forbidden path claimed as mount root: "
             << claimed_root << " by " << m->get_source() << dendl;
        // Tell the client we're rejecting their open
        mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
        mds->clog->warn() << "client session with invalid root '" <<
          claimed_root << "' denied (" << session->info.inst << ")";
        session->clear();
        // Drop out; don't record this session in SessionMap or journal it.
        break;
      }
    }

    if (session->is_closed())
      mds->sessionmap.add_session(session);

    pv = mds->sessionmap.mark_projected(session);
    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    mds->sessionmap.touch_session(session);
    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
                              new C_MDS_session_finish(this, session, sseq, true, pv));
    mdlog->flush();
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() ||
        session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        m->put();
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        m->put();
        return;
      }
      assert(session->is_open() ||
             session->is_stale() ||
             session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        m->put();
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle it the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "unexpected push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping from client " << session->get_human_name();
        m->put();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    mdlog->flush();
    break;

  default:
    ceph_abort();
  }
  m->put();
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    assert(session);
    if (!session->is_open() ||
        !session->connection.get() ||
        !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
      continue;
    version_t seq = session->wait_for_flush(gather.new_sub());
    mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  list<MDSInternalContextBase*> finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

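/*
 * Called once an ESession open/close entry is safely journaled.  Applies the
 * projected state: marks the session dirty, then either completes the open
 * (OPENING -> OPEN) or tears the session down (caps, leases, reconnect-gather
 * membership, and finally the connection).  If the session's state_seq moved
 * on while the entry was in flight, the event is a no-op.
 */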
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    assert(session->is_closing() || session->is_killing() ||
           session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    assert(session->connection != NULL);
    session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
    if (mdcache->is_readonly())
      session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, session->info.inst.name.num());
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.count(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      client_reconnect_gather.erase(session->info.get_client());

      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }

    if (session->is_closing()) {
      // mark con disposable.  if there is a fault, we will get a
      // reset and clean it up.  if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->connection != NULL) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->connection->mark_disposable();
      }

      // reset session
      mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->connection != NULL) {
        session->connection->mark_down();
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,uint64_t>& sseqmap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;
  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {

    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing())
      sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    else
      assert(session->is_open() ||
             session->is_opening() ||
             session->is_stale());
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                        map<client_t,uint64_t>& sseqmap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  int sessions_inserted = 0;
  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    sessions_inserted++;

    Session *session = mds->sessionmap.get_session(p->second.name);
    assert(session);

    if (sseqmap.count(p->first)) {
      uint64_t sseq = sseqmap[p->first];
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
        if (mdcache->is_readonly())
          mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(2) << "terminate_sessions" << dendl;

  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}

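/*
 * Scan for idle client sessions, in two passes: first, OPEN sessions whose
 * last cap renewal is older than mds_session_timeout are marked STALE (caps
 * revoked, leases dropped); second, STALE sessions older than
 * mds_session_autoclose are evicted, via OSD blacklist when
 * mds_session_blacklist_on_timeout is set.
 */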
void Server::find_idle_sessions()
{
  dout(10) << "find_idle_sessions.  laggy until " << mds->get_laggy_until() << dendl;

  // timeout/stale
  //  (caps go stale, leases die)
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_session_timeout;
  while (1) {
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
    if (!session) break;
    dout(20) << "laggiest active session is " << session->info.inst << dendl;
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
    mds->sessionmap.set_state(session, Session::STATE_STALE);
    mds->locker->revoke_stale_caps(session);
    mds->locker->remove_stale_leases(session);
    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
    finish_flush_session(session, session->get_push_seq());
  }

  // autoclose
  cutoff = now;
  cutoff -= g_conf->mds_session_autoclose;

  // don't kick clients if we've been laggy
  if (mds->get_laggy_until() > cutoff) {
    dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
             << ", not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 &&
      mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "not evicting a slow client, because there is only one"
             << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  std::vector<Session *> to_evict;
  const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
    return;
  }
  const auto &stale_sessions = sessions_p->second;
  assert(stale_sessions != nullptr);

  for (const auto &session: *stale_sessions) {
    if (session->is_importing()) {
      dout(10) << "stopping at importing session " << session->info.inst << dendl;
      break;
    }
    assert(session->is_stale());
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    to_evict.push_back(session);
  }

  for (const auto &session: to_evict) {
    utime_t age = now;
    age -= session->last_cap_renew;
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << age << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst << " last "
             << session->last_cap_renew << dendl;

    if (g_conf->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->info.inst.name.num(), false, true,
                        ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

/*
 * XXX bump in the interface here, not using an MDSInternalContextBase here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    assert(session->is_closing() ||
           session->is_closed() ||
           session->is_killing() ||
           session->is_importing());
    if (on_safe) {
      on_safe->complete(0);
    }
  }
}

size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  std::list<Session*> victims;
  const auto sessions = mds->sessionmap.get_sessions();
  for (const auto p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons; we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    if (blacklist.count(s->info.inst.addr)) {
      victims.push_back(s);
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

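/*
 * Begin closing (or killing) a session: project release of all of its
 * preallocated and pending-preallocated inos back to the InoTable, journal
 * an ESession close (finished by C_MDS_session_finish above), and kill any
 * requests the session still has in flight.
 */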
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;
  mds->sessionmap.get_client_set(client_reconnect_gather);

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = ceph_clock_now();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

/* This function DOES put the passed message before returning */
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = get_session(m);
  assert(session);

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect()) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (session->is_closed()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = ceph_clock_now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
        dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
        mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
        dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
               << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours.  take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need the path
      mdcache->rejoin_export_caps(p->first, from, p->second,
                                  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need the path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}


void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
  assert(reconnect_done);
  reconnect_done->complete(0);
  reconnect_done = NULL;
}

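/*
 * Periodic check during reconnect: once mds_reconnect_timeout has elapsed,
 * give up on any clients that have not reconnected and evict them.  When
 * blacklist-on-timeout is enabled, a gather waits for the blacklist
 * evictions to land before reconnect_gather_finish() runs.
 */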
void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  utime_t reconnect_end = reconnect_start;
  reconnect_end += g_conf->mds_reconnect_timeout;
  if (ceph_clock_now() >= reconnect_end &&
      !client_reconnect_gather.empty()) {
    dout(10) << "reconnect timed out" << dendl;

    // If we're doing blacklist evictions, use this to wait for them before
    // proceeding to reconnect_gather_finish
    MDSGatherBuilder gather(g_ceph_context);

    for (set<client_t>::iterator p = client_reconnect_gather.begin();
         p != client_reconnect_gather.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
      assert(session);
      dout(1) << "reconnect gave up on " << session->info.inst << dendl;

      mds->clog->warn() << "evicting unresponsive client " << *session
                        << ", after waiting " << g_conf->mds_reconnect_timeout
                        << " seconds during MDS startup";

      if (g_conf->mds_session_blacklist_on_timeout) {
        std::stringstream ss;
        mds->evict_client(session->info.inst.name.num(), false, true, ss,
                          gather.new_sub());
      } else {
        kill_session(session, NULL);
      }

      failed_reconnects++;
    }
    client_reconnect_gather.clear();

    if (gather.has_subs()) {
      dout(1) << "reconnect will complete once clients are evicted" << dendl;
      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
            [this](int r){reconnect_gather_finish();})));
      gather.activate();
      reconnect_evicting = true;
    } else {
      reconnect_gather_finish();
    }
  }
}

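/*
 * Replay a client's file locks from the flockbl blob it sent in its
 * reconnect message.  The bufferlist holds two back-to-back sections, fcntl
 * locks then flock locks, each encoded as a count followed by that many
 * ceph_filelock records.  The lock owner is rewritten to the reconnecting
 * client before insertion.
 */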
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  bufferlist::iterator p = locks.begin();
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}


/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
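/*
 * A worked example with hypothetical numbers: with mds_cache_size = 100000
 * the per-client ceiling is 80000 caps; a session holding 20000 caps at
 * ratio 0.5 gets newlim = min(20000 * 0.5, 80000) = 10000 and is sent a
 * CEPH_SESSION_RECALL_STATE with max_caps = 10000.  Sessions at or below
 * min_caps_per_client (100) are left alone.
 */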
void Server::recall_client_state(float ratio)
{
  int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
  int min_caps_per_client = 100;

  dout(10) << "recall_client_state " << ratio
           << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
           << dendl;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (!session->is_open() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << " session " << session->info.inst
             << " caps " << session->caps.size()
             << ", leases " << session->leases.size()
             << dendl;

    if (session->caps.size() > min_caps_per_client) {
      int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
      if (session->caps.size() > newlim) {
        MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
        m->head.max_caps = newlim;
        mds->send_message_client(m, session);
        session->notify_recall_sent(newlim);
      }
    }
  }
}

void Server::force_clients_readonly()
{
  dout(10) << "force_clients_readonly" << dendl;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (!session->info.inst.name.is_client() ||
        !(session->is_open() || session->is_stale()))
      continue;
    mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
  }
}

/*******
 * some generic stuff for finishing off requests
 */
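/*
 * Journal the update and reply to the client.  The trace inode/dentry are
 * pinned on the mdr for the eventual reply, an early (unsafe) reply is
 * attempted immediately, and the log entry is submitted.  For replayed ops
 * the next replay op is queued; otherwise rdlocks are dropped after an
 * early reply, or the log is flushed so the client hears back promptly.
 */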
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    mds->locker->drop_rdlocks(mdr.get());
  else
    mdlog->flush();
}

void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                const char *event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event_string(event_str);
  }
  mdlog->submit_entry(le, fin);
}

/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));

    // add here to avoid counting ops multiple times (e.g., locks, loading)
    switch(mdr->client_request->get_op()) {
    case CEPH_MDS_OP_LOOKUPHASH:
      logger->inc(l_mdss_req_lookuphash);
      break;
    case CEPH_MDS_OP_LOOKUPINO:
      logger->inc(l_mdss_req_lookupino);
      break;
    case CEPH_MDS_OP_LOOKUPPARENT:
      logger->inc(l_mdss_req_lookupparent);
      break;
    case CEPH_MDS_OP_LOOKUPNAME:
      logger->inc(l_mdss_req_lookupname);
      break;
    case CEPH_MDS_OP_LOOKUP:
      logger->inc(l_mdss_req_lookup);
      break;
    case CEPH_MDS_OP_LOOKUPSNAP:
      logger->inc(l_mdss_req_lookupsnap);
      break;
    case CEPH_MDS_OP_GETATTR:
      logger->inc(l_mdss_req_getattr);
      break;
    case CEPH_MDS_OP_SETATTR:
      logger->inc(l_mdss_req_setattr);
      break;
    case CEPH_MDS_OP_SETLAYOUT:
      logger->inc(l_mdss_req_setlayout);
      break;
    case CEPH_MDS_OP_SETDIRLAYOUT:
      logger->inc(l_mdss_req_setdirlayout);
      break;
    case CEPH_MDS_OP_SETXATTR:
      logger->inc(l_mdss_req_setxattr);
      break;
    case CEPH_MDS_OP_RMXATTR:
      logger->inc(l_mdss_req_rmxattr);
      break;
    case CEPH_MDS_OP_READDIR:
      logger->inc(l_mdss_req_readdir);
      break;
    case CEPH_MDS_OP_SETFILELOCK:
      logger->inc(l_mdss_req_setfilelock);
      break;
    case CEPH_MDS_OP_GETFILELOCK:
      logger->inc(l_mdss_req_getfilelock);
      break;
    case CEPH_MDS_OP_CREATE:
      logger->inc(l_mdss_req_create);
      // fall-through: a create also counts as an open
    case CEPH_MDS_OP_OPEN:
      logger->inc(l_mdss_req_open);
      break;
    case CEPH_MDS_OP_MKNOD:
      logger->inc(l_mdss_req_mknod);
      break;
    case CEPH_MDS_OP_LINK:
      logger->inc(l_mdss_req_link);
      break;
    case CEPH_MDS_OP_UNLINK:
      logger->inc(l_mdss_req_unlink);
      break;
    case CEPH_MDS_OP_RMDIR:
      logger->inc(l_mdss_req_rmdir);
      break;
    case CEPH_MDS_OP_RENAME:
      logger->inc(l_mdss_req_rename);
      break;
    case CEPH_MDS_OP_MKDIR:
      logger->inc(l_mdss_req_mkdir);
      break;
    case CEPH_MDS_OP_SYMLINK:
      logger->inc(l_mdss_req_symlink);
      break;
    case CEPH_MDS_OP_LSSNAP:
      logger->inc(l_mdss_req_lssnap);
      break;
    case CEPH_MDS_OP_MKSNAP:
      logger->inc(l_mdss_req_mksnap);
      break;
    case CEPH_MDS_OP_RMSNAP:
      logger->inc(l_mdss_req_rmsnap);
      break;
    case CEPH_MDS_OP_RENAMESNAP:
      logger->inc(l_mdss_req_renamesnap);
      break;
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}

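/*
 * Send an unsafe ("early") reply as soon as the update is submitted to the
 * journal, before it commits.  Skipped when slaves have journaled state,
 * when an ino was allocated, and for replayed requests; xlocks are marked
 * done so the uncommitted changes become visible to other requests.
 */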
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf->mds_early_reply)
    return;

  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  MClientRequest *req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  MClientReply *reply = new MClientReply(req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  // _rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
           << " (" << cpp_strerror(reply->get_result())
           << ") " << *req << dendl;

  if (tracei || tracedn) {
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
                   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  req->get_connection()->send_message(reply);

  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}

/*
 * send given reply
 * include a trace to tracei
 * Clean up mdr
 */
void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
{
  assert(mdr.get());
  MClientRequest *req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
          << " (" << cpp_strerror(reply->get_result())
          << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special: they only modify state in MDS memory,
  // and that state is lost if the MDS fails.  If a client re-sends a
  // completed setfilelock request, it means the client never received the
  // corresponding reply, so the MDS should re-execute the request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  snapid_t snapid = mdr->snapid;
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();
  int dentry_wanted = req->get_dentry_wanted();

  if (!did_early_reply && !is_replay) {

    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    dout(20) << "lat " << lat << dendl;

    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (client_inst.name.is_mds() || !session) {
    reply->put();   // mds doesn't need a reply
    reply = 0;
  } else {
    // send reply.
    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
        (tracei || tracedn)) {
      if (is_replay) {
        if (tracei)
          mdcache->try_reconnect_cap(tracei, session);
      } else {
        // include metadata in reply
        set_trace_dist(session, reply, tracei, tracedn,
                       snapid, dentry_wanted,
                       mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    req->get_connection()->send_message(reply);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
           << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}

void Server::encode_empty_dirstat(bufferlist& bl)
{
  static DirStat empty;
  empty.encode(bl);
}

void Server::encode_infinite_lease(bufferlist& bl)
{
  LeaseStat e;
  e.seq = 0;
  e.mask = -1;
  e.duration_ms = -1;
  ::encode(e, bl);
  dout(20) << "encode_infinite_lease " << e << dendl;
}

void Server::encode_null_lease(bufferlist& bl)
{
  LeaseStat e;
  e.seq = 0;
  e.mask = 0;
  e.duration_ms = 0;
  ::encode(e, bl);
  dout(20) << "encode_null_lease " << e << dendl;
}

/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 */
void Server::set_trace_dist(Session *session, MClientReply *reply,
                            CInode *in, CDentry *dn,
                            snapid_t snapid,
                            int dentry_wanted,
                            MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted);  // not true for snapshot lookups

  // realm
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    dir->encode_dirstat(bl, whoami);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    ::encode(dn->get_name(), bl);
    if (snapid == CEPH_NOSNAP)
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else
      encode_null_lease(bl);
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}


/***
 * process a client request
 * This function DOES put the passed message before returning
 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if (session->is_closed() ||
               session->is_closing() ||
               session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      if (req->is_queued_for_replay())
        mds->queue_one_replay();
      req->put();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it?  hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      // Don't send a traceless reply if the completed request created a
      // new inode; treat the request as a lookup request instead.
      if (req->is_replay() ||
          ((created == inodeno_t() || !mds->is_clientreplay()) &&
           req->get_op() != CEPH_MDS_OP_OPEN &&
           req->get_op() != CEPH_MDS_OP_CREATE)) {
        dout(5) << "already completed " << req->get_reqid() << dendl;
        MClientReply *reply = new MClientReply(req, 0);
        if (created != inodeno_t()) {
          bufferlist extra;
          ::encode(created, extra);
          reply->set_extra_bl(extra);
        }
        req->get_connection()->send_message(reply);

        if (req->is_queued_for_replay())
          mds->queue_one_replay();

        req->put();
        return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
          req->get_op() != CEPH_MDS_OP_CREATE) {
        dout(10) << " completed request which created new inode " << created
                 << ", convert it to lookup request" << dendl;
        req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
        req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Session's 'completed_requests' was dirtied; mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
          session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
        session->reset_num_trim_requests_warnings();
    } else {
      if (session->get_num_completed_requests() >=
          (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
        session->inc_num_trim_requests_warnings();
        stringstream ss;
        ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
           << req->get_oldest_client_tid() << "), "
           << session->get_num_completed_requests()
           << " completed requests recorded in session\n";
        mds->clog->warn() << ss.str();
        dout(20) << __func__ << " " << ss.str() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
         p != req->releases.end();
         ++p)
      mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
    req->releases.clear();
  }

  dispatch_client_request(mdr);
  return;
}

1706 void Server::handle_osd_map()
1707 {
1708 /* Note that we check the OSDMAP_FULL flag directly rather than
1709 * using osdmap_full_flag(), because we want to know "is the flag set"
1710 * rather than "does the flag apply to us?" */
1711 mds->objecter->with_osdmap([this](const OSDMap& o) {
1712 is_full = o.test_flag(CEPH_OSDMAP_FULL);
1713 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1714 << o.get_epoch() << dendl;
1715 });
1716 }
1717
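// Top-level dispatch for client requests: enforce read-only and
// full-cluster (ENOSPC) policy here, then fan out to the per-op handler.
// Ops that may free space (e.g. unlink) are still permitted while the
// OSD map is flagged full.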
1718 void Server::dispatch_client_request(MDRequestRef& mdr)
1719 {
1720 // we shouldn't be waiting on anyone.
1721 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1722
1723 if (mdr->killed) {
1724 dout(10) << "request " << *mdr << " was killed" << dendl;
1725 return;
1726 }
1727
1728 MClientRequest *req = mdr->client_request;
1729
1730 if (logger) logger->inc(l_mdss_dispatch_client_request);
1731
1732 dout(7) << "dispatch_client_request " << *req << dendl;
1733
1734 if (req->may_write()) {
1735 if (mdcache->is_readonly()) {
1736 dout(10) << " read-only FS" << dendl;
1737 respond_to_request(mdr, -EROFS);
1738 return;
1739 }
1740 if (mdr->has_more() && mdr->more()->slave_error) {
1741 dout(10) << " got error from slaves" << dendl;
1742 respond_to_request(mdr, mdr->more()->slave_error);
1743 return;
1744 }
1745 }
1746
1747 if (is_full) {
1748 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1749 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1751 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1752 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1753 req->get_op() == CEPH_MDS_OP_CREATE ||
1754 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1755 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1756 ((req->get_op() == CEPH_MDS_OP_LINK ||
1757 req->get_op() == CEPH_MDS_OP_RENAME) &&
1758 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1759 ) {
1760
1761 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1762 respond_to_request(mdr, -ENOSPC);
1763 return;
1764 } else {
1765 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1766 }
1767 }
1768
1769 switch (req->get_op()) {
1770 case CEPH_MDS_OP_LOOKUPHASH:
1771 case CEPH_MDS_OP_LOOKUPINO:
1772 handle_client_lookup_ino(mdr, false, false);
1773 break;
1774 case CEPH_MDS_OP_LOOKUPPARENT:
1775 handle_client_lookup_ino(mdr, true, false);
1776 break;
1777 case CEPH_MDS_OP_LOOKUPNAME:
1778 handle_client_lookup_ino(mdr, false, true);
1779 break;
1780
1781 // inodes ops.
1782 case CEPH_MDS_OP_LOOKUP:
1783 handle_client_getattr(mdr, true);
1784 break;
1785
1786 case CEPH_MDS_OP_LOOKUPSNAP:
1787 // lookupsnap does not reference a CDentry; treat it as a getattr
1788 case CEPH_MDS_OP_GETATTR:
1789 handle_client_getattr(mdr, false);
1790 break;
1791
1792 case CEPH_MDS_OP_SETATTR:
1793 handle_client_setattr(mdr);
1794 break;
1795 case CEPH_MDS_OP_SETLAYOUT:
1796 handle_client_setlayout(mdr);
1797 break;
1798 case CEPH_MDS_OP_SETDIRLAYOUT:
1799 handle_client_setdirlayout(mdr);
1800 break;
1801 case CEPH_MDS_OP_SETXATTR:
1802 handle_client_setxattr(mdr);
1803 break;
1804 case CEPH_MDS_OP_RMXATTR:
1805 handle_client_removexattr(mdr);
1806 break;
1807
1808 case CEPH_MDS_OP_READDIR:
1809 handle_client_readdir(mdr);
1810 break;
1811
1812 case CEPH_MDS_OP_SETFILELOCK:
1813 handle_client_file_setlock(mdr);
1814 break;
1815
1816 case CEPH_MDS_OP_GETFILELOCK:
1817 handle_client_file_readlock(mdr);
1818 break;
1819
1820 // funky.
1821 case CEPH_MDS_OP_CREATE:
1822 if (mdr->has_completed)
1823 handle_client_open(mdr); // already created.. just open
1824 else
1825 handle_client_openc(mdr);
1826 break;
1827
1828 case CEPH_MDS_OP_OPEN:
1829 handle_client_open(mdr);
1830 break;
1831
1832 // namespace.
1833 // no prior locks.
1834 case CEPH_MDS_OP_MKNOD:
1835 handle_client_mknod(mdr);
1836 break;
1837 case CEPH_MDS_OP_LINK:
1838 handle_client_link(mdr);
1839 break;
1840 case CEPH_MDS_OP_UNLINK:
1841 case CEPH_MDS_OP_RMDIR:
1842 handle_client_unlink(mdr);
1843 break;
1844 case CEPH_MDS_OP_RENAME:
1845 handle_client_rename(mdr);
1846 break;
1847 case CEPH_MDS_OP_MKDIR:
1848 handle_client_mkdir(mdr);
1849 break;
1850 case CEPH_MDS_OP_SYMLINK:
1851 handle_client_symlink(mdr);
1852 break;
1853
1854
1855 // snaps
1856 case CEPH_MDS_OP_LSSNAP:
1857 handle_client_lssnap(mdr);
1858 break;
1859 case CEPH_MDS_OP_MKSNAP:
1860 handle_client_mksnap(mdr);
1861 break;
1862 case CEPH_MDS_OP_RMSNAP:
1863 handle_client_rmsnap(mdr);
1864 break;
1865 case CEPH_MDS_OP_RENAMESNAP:
1866 handle_client_renamesnap(mdr);
1867 break;
1868
1869 default:
1870 dout(1) << " unknown client op " << req->get_op() << dendl;
1871 respond_to_request(mdr, -EOPNOTSUPP);
1872 }
1873 }
1874
1875
1876 // ---------------------------------------
1877 // SLAVE REQUESTS
1878
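// In a multi-MDS operation the "master" MDS drives the request while
// "slave" MDSes take locks and auth pins on its behalf; the handlers
// below implement the slave side of that exchange.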
1879 /* This function DOES put the passed message before returning*/
1880 void Server::handle_slave_request(MMDSSlaveRequest *m)
1881 {
1882 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1883 mds_rank_t from = mds_rank_t(m->get_source().num());
1884
1885 if (logger) logger->inc(l_mdss_handle_slave_request);
1886
1887 // reply?
1888 if (m->is_reply())
1889 return handle_slave_request_reply(m);
1890
1891 // The purpose of rename notify is to enforce causal message ordering, i.e. to make
1892 // sure bystanders have received all messages from the rename srcdn's auth MDS.
1893 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1894 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1895 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1896 mds->send_message(reply, m->get_connection());
1897 m->put();
1898 return;
1899 }
1900
1901 CDentry *straydn = NULL;
1902 if (m->stray.length() > 0) {
1903 straydn = mdcache->add_replica_stray(m->stray, from);
1904 assert(straydn);
1905 m->stray.clear();
1906 }
1907
1908 // am i a new slave?
1909 MDRequestRef mdr;
1910 if (mdcache->have_request(m->get_reqid())) {
1911 // existing?
1912 mdr = mdcache->request_get(m->get_reqid());
1913
1914 // is my request newer?
1915 if (mdr->attempt > m->get_attempt()) {
1916 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
1917 << ", dropping " << *m << dendl;
1918 m->put();
1919 return;
1920 }
1921
1922
1923 if (mdr->attempt < m->get_attempt()) {
1924 // mine is old, close it out
1925 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
1926 << ", closing out" << dendl;
1927 mdcache->request_finish(mdr);
1928 mdr.reset();
1929 } else if (mdr->slave_to_mds != from) {
1930 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
1931 m->put();
1932 return;
1933 }
1934
1935 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
1936 mdr->aborted = true;
1937 if (mdr->slave_request) {
1938 // only abort on-going xlock, wrlock and auth pin
1939 assert(!mdr->slave_did_prepare());
1940 } else {
1941 mdcache->request_finish(mdr);
1942 }
1943 return;
1944 }
1945 }
1946 if (!mdr.get()) {
1947 // new?
1948 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
1949 dout(10) << "missing slave request for " << m->get_reqid()
1950 << " OP_FINISH, must have lost race with a forward" << dendl;
1951 m->put();
1952 return;
1953 }
1954 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
1955 mdr->set_op_stamp(m->op_stamp);
1956 }
1957 assert(mdr->slave_request == 0); // only one at a time, please!
1958
1959 if (straydn) {
1960 mdr->pin(straydn);
1961 mdr->straydn = straydn;
1962 }
1963
1964 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1965 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1966 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1967 return;
1968 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
1969 mdr->locks.empty()) {
1970 dout(3) << "not active yet, waiting" << dendl;
1971 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
1972 return;
1973 }
1974
1975 mdr->slave_request = m;
1976
1977 dispatch_slave_request(mdr);
1978 }
1979
1980 /* This function DOES put the passed message before returning*/
1981 void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
1982 {
1983 mds_rank_t from = mds_rank_t(m->get_source().num());
1984
1985 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1986 metareqid_t r = m->get_reqid();
1987 if (!mdcache->have_uncommitted_master(r, from)) {
1988 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
1989 << from << " reqid " << r << dendl;
1990 m->put();
1991 return;
1992 }
1993 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1994 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1995 return;
1996 }
1997
1998 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
1999 metareqid_t r = m->get_reqid();
2000 mdcache->committed_master_slave(r, from);
2001 m->put();
2002 return;
2003 }
2004
2005 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2006 if (m->get_attempt() != mdr->attempt) {
2007 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2008 << m->get_attempt() << dendl;
2009 m->put();
2010 return;
2011 }
2012
2013 switch (m->get_op()) {
2014 case MMDSSlaveRequest::OP_XLOCKACK:
2015 {
2016 // identify lock, master request
2017 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2018 m->get_object_info());
2019 mdr->more()->slaves.insert(from);
2020 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2021 mdr->xlocks.insert(lock);
2022 mdr->locks.insert(lock);
2023 mdr->finish_locking(lock);
2024 lock->get_xlock(mdr, mdr->get_client());
2025
2026 assert(mdr->more()->waiting_on_slave.count(from));
2027 mdr->more()->waiting_on_slave.erase(from);
2028 assert(mdr->more()->waiting_on_slave.empty());
2029 mdcache->dispatch_request(mdr);
2030 }
2031 break;
2032
2033 case MMDSSlaveRequest::OP_WRLOCKACK:
2034 {
2035 // identify lock, master request
2036 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2037 m->get_object_info());
2038 mdr->more()->slaves.insert(from);
2039 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2040 mdr->remote_wrlocks[lock] = from;
2041 mdr->locks.insert(lock);
2042 mdr->finish_locking(lock);
2043
2044 assert(mdr->more()->waiting_on_slave.count(from));
2045 mdr->more()->waiting_on_slave.erase(from);
2046 assert(mdr->more()->waiting_on_slave.empty());
2047 mdcache->dispatch_request(mdr);
2048 }
2049 break;
2050
2051 case MMDSSlaveRequest::OP_AUTHPINACK:
2052 handle_slave_auth_pin_ack(mdr, m);
2053 break;
2054
2055 case MMDSSlaveRequest::OP_LINKPREPACK:
2056 handle_slave_link_prep_ack(mdr, m);
2057 break;
2058
2059 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2060 handle_slave_rmdir_prep_ack(mdr, m);
2061 break;
2062
2063 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2064 handle_slave_rename_prep_ack(mdr, m);
2065 break;
2066
2067 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2068 handle_slave_rename_notify_ack(mdr, m);
2069 break;
2070
2071 default:
2072 ceph_abort();
2073 }
2074
2075 // done with reply.
2076 m->put();
2077 }
2078
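// Roughly, each slave op below mirrors a Locker primitive requested by
// the master (xlock/wrlock acquire and release, auth pin, drop locks),
// while the *PREP ops stage the slave's half of a two-phase update.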
2079 /* This function DOES put the mdr->slave_request before returning*/
2080 void Server::dispatch_slave_request(MDRequestRef& mdr)
2081 {
2082 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2083
2084 if (mdr->aborted) {
2085 dout(7) << " abort flag set, finishing" << dendl;
2086 mdcache->request_finish(mdr);
2087 return;
2088 }
2089
2090 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2091
2092 int op = mdr->slave_request->get_op();
2093 switch (op) {
2094 case MMDSSlaveRequest::OP_XLOCK:
2095 case MMDSSlaveRequest::OP_WRLOCK:
2096 {
2097 // identify object
2098 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2099 mdr->slave_request->get_object_info());
2100
2101 if (!lock) {
2102 dout(10) << "don't have object, dropping" << dendl;
2103 ceph_abort(); // can this happen if we auth pinned properly?
2104 }
2105 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2106 dout(10) << "not auth for remote xlock attempt, dropping on "
2107 << *lock << " on " << *lock->get_parent() << dendl;
2108 } else {
2109 // use acquire_locks so that we get auth_pinning.
2110 set<SimpleLock*> rdlocks;
2111 set<SimpleLock*> wrlocks = mdr->wrlocks;
2112 set<SimpleLock*> xlocks = mdr->xlocks;
2113
2114 int replycode = 0;
2115 switch (op) {
2116 case MMDSSlaveRequest::OP_XLOCK:
2117 xlocks.insert(lock);
2118 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2119 break;
2120 case MMDSSlaveRequest::OP_WRLOCK:
2121 wrlocks.insert(lock);
2122 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2123 break;
2124 }
2125
2126 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
2127 return;
2128
2129 // ack
2130 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
2131 r->set_lock_type(lock->get_type());
2132 lock->get_parent()->set_object_info(r->get_object_info());
2133 mds->send_message(r, mdr->slave_request->get_connection());
2134 }
2135
2136 // done.
2137 mdr->slave_request->put();
2138 mdr->slave_request = 0;
2139 }
2140 break;
2141
2142 case MMDSSlaveRequest::OP_UNXLOCK:
2143 case MMDSSlaveRequest::OP_UNWRLOCK:
2144 {
2145 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2146 mdr->slave_request->get_object_info());
2147 assert(lock);
2148 bool need_issue = false;
2149 switch (op) {
2150 case MMDSSlaveRequest::OP_UNXLOCK:
2151 mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
2152 break;
2153 case MMDSSlaveRequest::OP_UNWRLOCK:
2154 mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
2155 break;
2156 }
2157 if (need_issue)
2158 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2159
2160 // done. no ack necessary.
2161 mdr->slave_request->put();
2162 mdr->slave_request = 0;
2163 }
2164 break;
2165
2166 case MMDSSlaveRequest::OP_DROPLOCKS:
2167 mds->locker->drop_locks(mdr.get());
2168 mdr->slave_request->put();
2169 mdr->slave_request = 0;
2170 break;
2171
2172 case MMDSSlaveRequest::OP_AUTHPIN:
2173 handle_slave_auth_pin(mdr);
2174 break;
2175
2176 case MMDSSlaveRequest::OP_LINKPREP:
2177 case MMDSSlaveRequest::OP_UNLINKPREP:
2178 handle_slave_link_prep(mdr);
2179 break;
2180
2181 case MMDSSlaveRequest::OP_RMDIRPREP:
2182 handle_slave_rmdir_prep(mdr);
2183 break;
2184
2185 case MMDSSlaveRequest::OP_RENAMEPREP:
2186 handle_slave_rename_prep(mdr);
2187 break;
2188
2189 case MMDSSlaveRequest::OP_FINISH:
2190 // information about caps imported by the rename
2191 if (mdr->slave_request->inode_export.length() > 0)
2192 mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
2193 // finish off request.
2194 mdcache->request_finish(mdr);
2195 break;
2196
2197 default:
2198 ceph_abort();
2199 }
2200 }
2201
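// An auth pin keeps an object from being frozen or migrated to another
// MDS while a request is in flight. The handler below takes pins on
// behalf of a remote master and acks back exactly which objects (and
// which frozen inode, if any) are now pinned here.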
2202 /* This function DOES put the mdr->slave_request before returning*/
2203 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2204 {
2205 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2206
2207 // build list of objects
2208 list<MDSCacheObject*> objects;
2209 CInode *auth_pin_freeze = NULL;
2210 bool fail = false, wouldblock = false, readonly = false;
2211
2212 if (mdcache->is_readonly()) {
2213 dout(10) << " read-only FS" << dendl;
2214 readonly = true;
2215 fail = true;
2216 }
2217
2218 if (!fail) {
2219 for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
2220 p != mdr->slave_request->get_authpins().end();
2221 ++p) {
2222 MDSCacheObject *object = mdcache->get_object(*p);
2223 if (!object) {
2224 dout(10) << " don't have " << *p << dendl;
2225 fail = true;
2226 break;
2227 }
2228
2229 objects.push_back(object);
2230 if (*p == mdr->slave_request->get_authpin_freeze())
2231 auth_pin_freeze = static_cast<CInode*>(object);
2232 }
2233 }
2234
2235 // can we auth pin them?
2236 if (!fail) {
2237 for (list<MDSCacheObject*>::iterator p = objects.begin();
2238 p != objects.end();
2239 ++p) {
2240 if (!(*p)->is_auth()) {
2241 dout(10) << " not auth for " << **p << dendl;
2242 fail = true;
2243 break;
2244 }
2245 if (mdr->is_auth_pinned(*p))
2246 continue;
2247 if (!mdr->can_auth_pin(*p)) {
2248 if (mdr->slave_request->is_nonblock()) {
2249 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2250 fail = true;
2251 wouldblock = true;
2252 break;
2253 }
2254 // wait
2255 dout(10) << " waiting for authpinnable on " << **p << dendl;
2256 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2257 mdr->drop_local_auth_pins();
2258
2259 CDir *dir = NULL;
2260 if (CInode *in = dynamic_cast<CInode*>(*p)) {
2261 if (!in->is_root())
2262 dir = in->get_parent_dir();
2263 } else if (CDentry *dn = dynamic_cast<CDentry*>(*p)) {
2264 dir = dn->get_dir();
2265 } else {
2266 ceph_abort();
2267 }
2268 if (dir) {
2269 if (dir->is_freezing_dir())
2270 mdcache->fragment_freeze_inc_num_waiters(dir);
2271 if (dir->is_freezing_tree()) {
2272 while (!dir->is_freezing_tree_root())
2273 dir = dir->get_parent_dir();
2274 mdcache->migrator->export_freeze_inc_num_waiters(dir);
2275 }
2276 }
2277 return;
2278 }
2279 }
2280 }
2281
2282 // auth pin!
2283 if (fail) {
2284 mdr->drop_local_auth_pins(); // just in case
2285 } else {
2286 /* the frozen auth pin is on the wrong inode; unfreeze it */
2287 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2288 mdr->more()->rename_inode != auth_pin_freeze)
2289 mdr->unfreeze_auth_pin(true);
2290
2291 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2292 * on the source inode to complete. This happens after all locks for the rename
2293 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2294 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2295 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2296 * is called. The solution is to freeze the inode and prevent other MDRequests
2297 * from getting new auth pins.
2298 */
2299 if (auth_pin_freeze) {
2300 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2301 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2302 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2303 mds->mdlog->flush();
2304 return;
2305 }
2306 }
2307 for (list<MDSCacheObject*>::iterator p = objects.begin();
2308 p != objects.end();
2309 ++p) {
2310 dout(10) << "auth_pinning " << **p << dendl;
2311 mdr->auth_pin(*p);
2312 }
2313 }
2314
2315 // ack!
2316 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2317
2318 // return list of my auth_pins (if any)
2319 for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
2320 p != mdr->auth_pins.end();
2321 ++p) {
2322 MDSCacheObjectInfo info;
2323 (*p)->set_object_info(info);
2324 reply->get_authpins().push_back(info);
2325 if (*p == (MDSCacheObject*)auth_pin_freeze)
2326 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2327 }
2328
2329 if (wouldblock)
2330 reply->mark_error_wouldblock();
2331 if (readonly)
2332 reply->mark_error_rofs();
2333
2334 mds->send_message_mds(reply, mdr->slave_to_mds);
2335
2336 // clean up this request
2337 mdr->slave_request->put();
2338 mdr->slave_request = 0;
2339 return;
2340 }
2341
2342 /* This function DOES NOT put the passed ack before returning*/
2343 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
2344 {
2345 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2346 mds_rank_t from = mds_rank_t(ack->get_source().num());
2347
2348 // added auth pins?
2349 set<MDSCacheObject*> pinned;
2350 for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
2351 p != ack->get_authpins().end();
2352 ++p) {
2353 MDSCacheObject *object = mdcache->get_object(*p);
2354 assert(object); // we pinned it
2355 dout(10) << " remote has pinned " << *object << dendl;
2356 if (!mdr->is_auth_pinned(object))
2357 mdr->remote_auth_pins[object] = from;
2358 if (*p == ack->get_authpin_freeze())
2359 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2360 pinned.insert(object);
2361 }
2362
2363 // removed frozen auth pin?
2364 if (mdr->more()->is_remote_frozen_authpin &&
2365 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2366 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2367 assert(p != mdr->remote_auth_pins.end());
2368 if (p->second == from) {
2369 mdr->more()->is_remote_frozen_authpin = false;
2370 }
2371 }
2372
2373 // removed auth pins?
2374 map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
2375 while (p != mdr->remote_auth_pins.end()) {
2376 MDSCacheObject* object = p->first;
2377 if (p->second == from && pinned.count(object) == 0) {
2378 dout(10) << " remote has unpinned " << *object << dendl;
2379 mdr->remote_auth_pins.erase(p++);
2380 } else {
2381 ++p;
2382 }
2383 }
2384
2385 if (ack->is_error_rofs()) {
2386 mdr->more()->slave_error = -EROFS;
2387 mdr->aborted = true;
2388 } else if (ack->is_error_wouldblock()) {
2389 mdr->more()->slave_error = -EWOULDBLOCK;
2390 mdr->aborted = true;
2391 }
2392
2393 // note slave
2394 mdr->more()->slaves.insert(from);
2395
2396 // clear from waiting list
2397 assert(mdr->more()->waiting_on_slave.count(from));
2398 mdr->more()->waiting_on_slave.erase(from);
2399
2400 // go again?
2401 if (mdr->more()->waiting_on_slave.empty())
2402 mdcache->dispatch_request(mdr);
2403 else
2404 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2405 }
2406
2407
2408 // ---------------------------------------
2409 // HELPERS
2410
2411
2412 /**
2413 * check whether we are permitted to complete a request
2414 *
2415 * Check whether we have permission to perform the operation specified
2416 * by mask on the given inode, based on the capability in the mdr's
2417 * session.
2418 */
2419 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2420 {
2421 if (mdr->session) {
2422 int r = mdr->session->check_access(
2423 in, mask,
2424 mdr->client_request->get_caller_uid(),
2425 mdr->client_request->get_caller_gid(),
2426 &mdr->client_request->get_caller_gid_list(),
2427 mdr->client_request->head.args.setattr.uid,
2428 mdr->client_request->head.args.setattr.gid);
2429 if (r < 0) {
2430 respond_to_request(mdr, r);
2431 return false;
2432 }
2433 }
2434 return true;
2435 }
2436
2437 /**
2438 * check whether a dirfrag has reached its maximum size
2439 * (mds_bal_fragment_size_max); if so, reply -ENOSPC and return false.
2440 */
2441 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
2442 {
2443 const auto size = dir->get_frag_size();
2444 if (size >= g_conf->mds_bal_fragment_size_max) {
2445 dout(10) << "fragment " << *dir << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2446 respond_to_request(mdr, -ENOSPC);
2447 return false;
2448 }
2449
2450 return true;
2451 }
2452
2453
2454 /** validate_dentry_dir
2455 *
2456 * verify that the dir exists and would own the dname.
2457 * do not check if the dentry exists.
2458 */
2459 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
2460 {
2461 // make sure parent is a dir?
2462 if (!diri->is_dir()) {
2463 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2464 respond_to_request(mdr, -ENOTDIR);
2465 return NULL;
2466 }
2467
2468 // which dirfrag?
2469 frag_t fg = diri->pick_dirfrag(dname);
2470 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2471 if (!dir)
2472 return 0;
2473
2474 // frozen?
2475 if (dir->is_frozen()) {
2476 dout(7) << "dir is frozen " << *dir << dendl;
2477 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2478 return NULL;
2479 }
2480
2481 return dir;
2482 }
2483
2484
2485 /** prepare_null_dentry
2486 * prepare a null (or existing) dentry in given dir.
2487 * wait for any dn lock.
2488 */
2489 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
2490 {
2491 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2492 assert(dir->is_auth());
2493
2494 client_t client = mdr->get_client();
2495
2496 // does it already exist?
2497 CDentry *dn = dir->lookup(dname);
2498 if (dn) {
2499 /*
2500 if (dn->lock.is_xlocked_by_other(mdr)) {
2501 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2502 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2503 return 0;
2504 }
2505 */
2506 if (!dn->get_linkage(client, mdr)->is_null()) {
2507 // name already exists
2508 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
2509 if (!okexist) {
2510 respond_to_request(mdr, -EEXIST);
2511 return 0;
2512 }
2513 } else {
2514 dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
2515 }
2516
2517 return dn;
2518 }
2519
2520 // make sure dir is complete
2521 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2522 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2523 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2524 return 0;
2525 }
2526
2527 // create
2528 dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
2529 dn->mark_new();
2530 dout(10) << "prepare_null_dentry added " << *dn << dendl;
2531 return dn;
2532 }
2533
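// A stray dentry reparents an inode that is being unlinked while it may
// still be referenced; the inode lives under the MDS's stray directory
// until it is purged or reintegrated elsewhere.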
2534 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2535 {
2536 CDentry *straydn = mdr->straydn;
2537 if (straydn) {
2538 string straydname;
2539 in->name_stray_dentry(straydname);
2540 if (straydn->get_name() == straydname)
2541 return straydn;
2542
2543 assert(!mdr->done_locking);
2544 mdr->unpin(straydn);
2545 }
2546
2547 CDir *straydir = mdcache->get_stray_dir(in);
2548
2549 if (!mdr->client_request->is_replay() &&
2550 !check_fragment_space(mdr, straydir))
2551 return NULL;
2552
2553 straydn = mdcache->get_or_create_stray_dentry(in);
2554 mdr->straydn = straydn;
2555 mdr->pin(straydn);
2556 return straydn;
2557 }
2558
2559 /** prepare_new_inode
2560 *
2561 * create a new inode. set c/m/atime. hit dir pop.
2562 */
2563 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
2564 file_layout_t *layout)
2565 {
2566 CInode *in = new CInode(mdcache);
2567
2568 // Server::prepare_force_open_sessions() can re-open a session that is in the
2569 // closing state. In that corner case, the session's prealloc_inos are being freed.
2570 // To simplify the code, we disallow using/refilling the session's prealloc_inos
2571 // while the session is opening.
2572 bool allow_prealloc_inos = !mdr->session->is_opening();
2573
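// Inos come from the session's preallocated range when possible;
// otherwise fall back to a fresh InoTable allocation. The prealloc pool
// is topped back up below once it drops under half of
// mds_client_prealloc_inos (e.g. with mds_client_prealloc_inos = 1000,
// we refill once fewer than 500 remain, requesting enough to get back
// to 1000).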
2574 // assign ino
2575 if (allow_prealloc_inos &&
2576 mdr->session->info.prealloc_inos.size()) {
2577 mdr->used_prealloc_ino =
2578 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
2579 mds->sessionmap.mark_projected(mdr->session);
2580
2581 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
2582 << " (" << mdr->session->info.prealloc_inos
2583 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
2584 << dendl;
2585 } else {
2586 mdr->alloc_ino =
2587 in->inode.ino = mds->inotable->project_alloc_id();
2588 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
2589 }
2590
2591 if (useino && useino != in->inode.ino) {
2592 dout(0) << "WARNING: client specified " << useino << " but I allocated " << in->inode.ino << dendl;
2593 mds->clog->error() << mdr->client_request->get_source()
2594 << " specified ino " << useino
2595 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
2596 //ceph_abort(); // just for now.
2597 }
2598
2599 if (allow_prealloc_inos &&
2600 mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
2601 int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
2602 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
2603 assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
2604 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
2605 mds->sessionmap.mark_projected(mdr->session);
2606 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
2607 }
2608
2609 in->inode.version = 1;
2610 in->inode.xattr_version = 1;
2611 in->inode.nlink = 1; // FIXME
2612
2613 in->inode.mode = mode;
2614
2615 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
2616 if (in->inode.is_dir()) {
2617 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
2618 } else if (layout) {
2619 in->inode.layout = *layout;
2620 } else {
2621 in->inode.layout = mdcache->default_file_layout;
2622 }
2623
2624 in->inode.truncate_size = -1ull; // not truncated, yet!
2625 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
2626
2627 CInode *diri = dir->get_inode();
2628
2629 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
2630
2631 if (diri->inode.mode & S_ISGID) {
2632 dout(10) << " dir is setgid" << dendl;
2633 in->inode.gid = diri->inode.gid;
2634 if (S_ISDIR(mode)) {
2635 dout(10) << " new dir also setgid" << dendl;
2636 in->inode.mode |= S_ISGID;
2637 }
2638 } else
2639 in->inode.gid = mdr->client_request->get_caller_gid();
2640
2641 in->inode.uid = mdr->client_request->get_caller_uid();
2642
2643 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
2644 mdr->get_op_stamp();
2645
2646 in->inode.change_attr = 0;
2647
2648 MClientRequest *req = mdr->client_request;
2649 if (req->get_data().length()) {
2650 bufferlist::iterator p = req->get_data().begin();
2651
2652 // xattrs on new inode?
2653 map<string,bufferptr> xattrs;
2654 ::decode(xattrs, p);
2655 for (map<string,bufferptr>::iterator q = xattrs.begin(); q != xattrs.end(); ++q) {
2656 dout(10) << "prepare_new_inode setting xattr " << q->first << dendl;
2657 in->xattrs[q->first] = q->second;
2658 }
2659 }
2660
2661 if (!mds->mdsmap->get_inline_data_enabled() ||
2662 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
2663 in->inode.inline_data.version = CEPH_INLINE_NONE;
2664
2665 mdcache->add_inode(in); // add
2666 dout(10) << "prepare_new_inode " << *in << dendl;
2667 return in;
2668 }
2669
2670 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
2671 {
2672 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
2673 << " inotablev " << mds->inotable->get_projected_version()
2674 << dendl;
2675 blob->set_ino_alloc(mdr->alloc_ino,
2676 mdr->used_prealloc_ino,
2677 mdr->prealloc_inos,
2678 mdr->client_request->get_source(),
2679 mds->sessionmap.get_projected(),
2680 mds->inotable->get_projected_version());
2681 }
2682
2683 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2684 {
2685 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2686 << " / " << mdr->prealloc_inos
2687 << " / " << mdr->used_prealloc_ino << dendl;
2688
2689 if (mdr->alloc_ino) {
2690 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2691 }
2692 if (mdr->prealloc_inos.size()) {
2693 assert(session);
2694 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2695 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2696 mds->sessionmap.mark_dirty(session);
2697 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2698 }
2699 if (mdr->used_prealloc_ino) {
2700 assert(session);
2701 session->info.used_inos.erase(mdr->used_prealloc_ino);
2702 mds->sessionmap.mark_dirty(session);
2703 }
2704 }
2705
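// Retry helper for ESTALE path lookups: ask the other ranks whether they
// know the ino; on success re-dispatch the request locally, otherwise
// return the ESTALE to the client.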
2706 class C_MDS_TryFindInode : public ServerContext {
2707 MDRequestRef mdr;
2708 public:
2709 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2710 void finish(int r) override {
2711 if (r == -ESTALE) // :( find_ino_peers failed
2712 server->respond_to_request(mdr, r);
2713 else
2714 server->dispatch_client_request(mdr);
2715 }
2716 };
2717
2718 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2719 {
2720 // figure parent dir vs dname
2721 if (refpath.depth() == 0) {
2722 dout(7) << "can't do that to root" << dendl;
2723 respond_to_request(mdr, -EINVAL);
2724 return 0;
2725 }
2726 string dname = refpath.last_dentry();
2727 refpath.pop_dentry();
2728
2729 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2730
2731 // traverse to parent dir
2732 CInode *diri;
2733 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2734 if (r > 0) return 0; // delayed
2735 if (r < 0) {
2736 if (r == -ESTALE) {
2737 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2738 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2739 return 0;
2740 }
2741 respond_to_request(mdr, r);
2742 return 0;
2743 }
2744
2745 // is it an auth dir?
2746 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2747 if (!dir)
2748 return 0; // forwarded or waiting for freeze
2749
2750 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2751 return dir;
2752 }
2753
2754 /* If this returns null, the request has been handled
2755 * as appropriate: forwarded on, or the client's been replied to */
2756 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
2757 set<SimpleLock*> &rdlocks,
2758 bool want_auth,
2759 bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
2760 a snapped dir */
2761 file_layout_t **layout,
2762 bool no_lookup) // true if we cannot return a null dentry lease
2763 {
2764 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2765 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
2766
2767 if (mdr->done_locking)
2768 return mdr->in[n];
2769
2770 // traverse
2771 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
2772 if (r > 0)
2773 return NULL; // delayed
2774 if (r < 0) { // error
2775 if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
2776 if (!no_lookup)
2777 mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
2778 respond_to_request(mdr, r);
2779 } else if (r == -ESTALE) {
2780 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2781 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
2782 mdcache->find_ino_peers(refpath.get_ino(), c);
2783 } else {
2784 dout(10) << "FAIL on error " << r << dendl;
2785 respond_to_request(mdr, r);
2786 }
2787 return 0;
2788 }
2789 CInode *ref = mdr->in[n];
2790 dout(10) << "ref is " << *ref << dendl;
2791
2792 // fw to inode auth?
2793 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
2794 want_auth = true;
2795
2796 if (want_auth) {
2797 if (ref->is_ambiguous_auth()) {
2798 dout(10) << "waiting for single auth on " << *ref << dendl;
2799 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
2800 return 0;
2801 }
2802 if (!ref->is_auth()) {
2803 dout(10) << "fw to auth for " << *ref << dendl;
2804 mdcache->request_forward(mdr, ref->authority().first);
2805 return 0;
2806 }
2807
2808 // auth_pin?
2809 // do NOT proceed if freezing, as cap release may defer in that case, and
2810 // we could deadlock when we try to lock @ref.
2811 // if we're already auth_pinned, continue; the release has already been processed.
2812 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
2813 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
2814 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
2815 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2816 /* If we have any auth pins, this will deadlock.
2817 * But the only way to get here with auth pins already held is
2818 * that we're on an inode with snapshots that got updated
2819 * between dispatches of this request. So we're going to drop
2820 * our locks and our auth pins and reacquire them later.
2821 *
2822 * This is safe since we're only in this function when working on
2823 * a single MDS request; otherwise we'd be in
2824 * rdlock_path_xlock_dentry.
2825 */
2826 mds->locker->drop_locks(mdr.get(), NULL);
2827 mdr->drop_local_auth_pins();
2828 return 0;
2829 }
2830
2831 mdr->auth_pin(ref);
2832 }
2833
2834 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2835 rdlocks.insert(&mdr->dn[n][i]->lock);
2836 if (layout)
2837 mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
2838 else
2839 mds->locker->include_snap_rdlocks(rdlocks, ref);
2840
2841 // set and pin ref
2842 mdr->pin(ref);
2843 return ref;
2844 }
2845
2846
2847 /** rdlock_path_xlock_dentry
2848 * traverse path to the directory that could/would contain dentry.
2849 * make sure i am auth for that dentry, forward as necessary.
2850 * create null dentry in place (or use existing if okexist).
2851 * get rdlocks on traversed dentries, xlock on new dentry.
2852 */
2853 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
2854 set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
2855 bool okexist, bool mustexist, bool alwaysxlock,
2856 file_layout_t **layout)
2857 {
2858 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2859
2860 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
2861
2862 client_t client = mdr->get_client();
2863
2864 if (mdr->done_locking)
2865 return mdr->dn[n].back();
2866
2867 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
2868 if (!dir) return 0;
2869 dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl;
2870
2871 // make sure we can auth_pin (or have already authpinned) dir
2872 if (dir->is_frozen()) {
2873 dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl;
2874 dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2875 return 0;
2876 }
2877
2878 CInode *diri = dir->get_inode();
2879 if (!mdr->reqid.name.is_mds()) {
2880 if (diri->is_system() && !diri->is_root()) {
2881 respond_to_request(mdr, -EROFS);
2882 return 0;
2883 }
2884 }
2885 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
2886 respond_to_request(mdr, -ENOENT);
2887 return 0;
2888 }
2889
2890 // make a null dentry?
2891 const string &dname = refpath.last_dentry();
2892 CDentry *dn;
2893 if (mustexist) {
2894 dn = dir->lookup(dname);
2895
2896 // make sure dir is complete
2897 if (!dn && !dir->is_complete() &&
2898 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2899 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2900 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2901 return 0;
2902 }
2903
2904 // readable?
2905 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
2906 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2907 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2908 return 0;
2909 }
2910
2911 // exists?
2912 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
2913 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
2914 respond_to_request(mdr, -ENOENT);
2915 return 0;
2916 }
2917 } else {
2918 dn = prepare_null_dentry(mdr, dir, dname, okexist);
2919 if (!dn)
2920 return 0;
2921 }
2922
2923 mdr->dn[n].push_back(dn);
2924 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
2925 mdr->in[n] = dnl->get_inode();
2926
2927 // -- lock --
2928 // NOTE: rename takes the same set of locks for srcdn
2929 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2930 rdlocks.insert(&mdr->dn[n][i]->lock);
2931 if (alwaysxlock || dnl->is_null())
2932 xlocks.insert(&dn->lock); // new dn, xlock
2933 else
2934 rdlocks.insert(&dn->lock); // existing dn, rdlock
2935 wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
2936 wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir rstats
2937 if (layout)
2938 mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
2939 else
2940 mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);
2941
2942 return dn;
2943 }
2944
2945
2946
2947
2948
2949 /**
2950 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2951 *
2952 * @param diri base inode
2953 * @param fg the exact frag we want
2954 * @param mdr request
2955 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2956 */
2957 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2958 {
2959 CDir *dir = diri->get_dirfrag(fg);
2960
2961 // not open and inode not mine?
2962 if (!dir && !diri->is_auth()) {
2963 mds_rank_t inauth = diri->authority().first;
2964 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2965 mdcache->request_forward(mdr, inauth);
2966 return 0;
2967 }
2968
2969 // not open and inode frozen?
2970 if (!dir && diri->is_frozen()) {
2971 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2972 assert(diri->get_parent_dir());
2973 diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2974 return 0;
2975 }
2976
2977 // invent?
2978 if (!dir)
2979 dir = diri->get_or_open_dirfrag(mdcache, fg);
2980
2981 // am i auth for the dirfrag?
2982 if (!dir->is_auth()) {
2983 mds_rank_t auth = dir->authority().first;
2984 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2985 << ", fw to mds." << auth << dendl;
2986 mdcache->request_forward(mdr, auth);
2987 return 0;
2988 }
2989
2990 return dir;
2991 }
2992
2993
2994 // ===============================================================================
2995 // STAT
2996
2997 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
2998 {
2999 MClientRequest *req = mdr->client_request;
3000 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3001
3002 if (req->get_filepath().depth() == 0 && is_lookup) {
3003 // refpath can't be empty for lookup but it can for
3004 // getattr (we do getattr with empty refpath for mount of '/')
3005 respond_to_request(mdr, -EINVAL);
3006 return;
3007 }
3008
3009 CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
3010 if (!ref) return;
3011
3012 /*
3013 * if client currently holds the EXCL cap on a field, do not rdlock
3014 * it; client's stat() will result in valid info if _either_ EXCL
3015 * cap is held or MDS rdlocks and reads the value here.
3016 *
3017 * handling this case here is easier than weakening rdlock
3018 * semantics... that would cause problems elsewhere.
3019 */
3020 client_t client = mdr->get_client();
3021 int issued = 0;
3022 Capability *cap = ref->get_client_cap(client);
3023 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3024 mdr->snapid <= cap->client_follows))
3025 issued = cap->issued();
3026
3027 int mask = req->head.args.getattr.mask;
3028 if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
3029 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
3030 if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
3031 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);
3032
3033 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3034 return;
3035
3036 if (!check_access(mdr, ref, MAY_READ))
3037 return;
3038
3039 // note which caps are requested, so we return at least a snapshot
3040 // value for them. (currently this matters for xattrs and inline data)
3041 mdr->getattr_caps = mask;
3042
3043 mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
3044 req->get_source().num());
3045
3046 // reply
3047 dout(10) << "reply to stat on " << *req << dendl;
3048 mdr->tracei = ref;
3049 if (is_lookup)
3050 mdr->tracedn = mdr->dn[0].back();
3051 respond_to_request(mdr, 0);
3052 }
3053
3054 struct C_MDS_LookupIno2 : public ServerContext {
3055 MDRequestRef mdr;
3056 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3057 void finish(int r) override {
3058 server->_lookup_ino_2(mdr, r);
3059 }
3060 };
3061
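// The LOOKUPINO family resolves an inode by number rather than by path
// (e.g. for filehandle-style access); want_parent and want_dentry
// correspond to the LOOKUPPARENT and LOOKUPNAME variants.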
3062 /* This function DOES clean up the mdr before returning*/
3063 /*
3064 * filepath: ino
3065 */
3066 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3067 bool want_parent, bool want_dentry)
3068 {
3069 MClientRequest *req = mdr->client_request;
3070
3071 inodeno_t ino = req->get_filepath().get_ino();
3072 CInode *in = mdcache->get_inode(ino);
3073 if (in && in->state_test(CInode::STATE_PURGING)) {
3074 respond_to_request(mdr, -ESTALE);
3075 return;
3076 }
3077 if (!in) {
3078 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3079 return;
3080 }
3081
3082 if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
3083 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3084 return;
3085 }
3086
3087 // check for nothing (not read or write); this still applies the
3088 // path check.
3089 if (!check_access(mdr, in, 0))
3090 return;
3091
3092 CDentry *dn = in->get_projected_parent_dn();
3093 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3094
3095 set<SimpleLock*> rdlocks;
3096 if (dn && (want_parent || want_dentry)) {
3097 mdr->pin(dn);
3098 rdlocks.insert(&dn->lock);
3099 }
3100
3101 unsigned mask = req->head.args.getattr.mask;
3102 if (mask) {
3103 Capability *cap = in->get_client_cap(mdr->get_client());
3104 int issued = 0;
3105 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3106 issued = cap->issued();
3107 // permission bits, ACL/security xattrs
3108 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3109 rdlocks.insert(&in->authlock);
3110 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3111 rdlocks.insert(&in->xattrlock);
3112
3113 mdr->getattr_caps = mask;
3114 }
3115
3116 if (!rdlocks.empty()) {
3117 set<SimpleLock*> wrlocks, xlocks;
3118 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3119 return;
3120
3121 // need read access to directory inode
3122 if (!check_access(mdr, diri, MAY_READ))
3123 return;
3124 }
3125
3126 if (want_parent) {
3127 if (in->is_base()) {
3128 respond_to_request(mdr, -EINVAL);
3129 return;
3130 }
3131 if (!diri || diri->is_stray()) {
3132 respond_to_request(mdr, -ESTALE);
3133 return;
3134 }
3135 dout(10) << "reply to lookup_parent " << *in << dendl;
3136 mdr->tracei = diri;
3137 respond_to_request(mdr, 0);
3138 } else {
3139 if (want_dentry) {
3140 inodeno_t dirino = req->get_filepath2().get_ino();
3141 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3142 respond_to_request(mdr, -ENOENT);
3143 return;
3144 }
3145 dout(10) << "reply to lookup_name " << *in << dendl;
3146 } else
3147 dout(10) << "reply to lookup_ino " << *in << dendl;
3148
3149 mdr->tracei = in;
3150 if (want_dentry)
3151 mdr->tracedn = dn;
3152 respond_to_request(mdr, 0);
3153 }
3154 }
3155
3156 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3157 {
3158 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3159 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3160
3161 // `r` is a rank if >=0, else an error code
3162 if (r >= 0) {
3163 mds_rank_t dest_rank(r);
3164 if (dest_rank == mds->get_nodeid())
3165 dispatch_client_request(mdr);
3166 else
3167 mdcache->request_forward(mdr, dest_rank);
3168 return;
3169 }
3170
3171 // give up
3172 if (r == -ENOENT || r == -ENODATA)
3173 r = -ESTALE;
3174 respond_to_request(mdr, r);
3175 }
3176
3177
3178 /* This function takes responsibility for the passed mdr*/
3179 void Server::handle_client_open(MDRequestRef& mdr)
3180 {
3181 MClientRequest *req = mdr->client_request;
3182 dout(7) << "open on " << req->get_filepath() << dendl;
3183
3184 int flags = req->head.args.open.flags;
3185 int cmode = ceph_flags_to_mode(flags);
3186 if (cmode < 0) {
3187 respond_to_request(mdr, -EINVAL);
3188 return;
3189 }
3190
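// writes, and O_TRUNC opens, must be handled on the auth MDS; read-only
// opens may be served from a replica.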
3191 bool need_auth = !file_mode_is_readonly(cmode) || (flags & CEPH_O_TRUNC);
3192
3193 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3194 dout(7) << "read-only FS" << dendl;
3195 respond_to_request(mdr, -EROFS);
3196 return;
3197 }
3198
3199 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3200 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3201 if (!cur)
3202 return;
3203
3204 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3205 assert(!need_auth);
3206 mdr->done_locking = false;
3207 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); // don't shadow the outer cur
3208 if (!cur)
3209 return;
3210 }
3211
3212 if (!cur->inode.is_file()) {
3213 // we can only open a non-regular inode with mode FILE_MODE_PIN, at least for now.
3214 cmode = CEPH_FILE_MODE_PIN;
3215 // the inode is a symlink and the client wants to follow it; ignore the O_TRUNC flag.
3216 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3217 flags &= ~CEPH_O_TRUNC;
3218 }
3219
3220 dout(10) << "open flags = " << flags
3221 << ", filemode = " << cmode
3222 << ", need_auth = " << need_auth
3223 << dendl;
3224
3225 // regular file?
3226 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3227 dout(7) << "not a file or dir " << *cur << dendl;
3228 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3229 return;
3230 }*/
3231 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3232 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3233 respond_to_request(mdr, -EINVAL);
3234 return;
3235 }
3236
3237 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3238 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3239 // return -EISDIR for a directory, -EINVAL for other non-regular inodes
3240 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3241 return;
3242 }
3243
3244 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3245 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3246 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3247 respond_to_request(mdr, -EPERM);
3248 return;
3249 }
3250
3251 // snapped data is read only
3252 if (mdr->snapid != CEPH_NOSNAP &&
3253 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3254 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3255 respond_to_request(mdr, -EROFS);
3256 return;
3257 }
3258
3259 unsigned mask = req->head.args.open.mask;
3260 if (mask) {
3261 Capability *cap = cur->get_client_cap(mdr->get_client());
3262 int issued = 0;
3263 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3264 issued = cap->issued();
3265 // permission bits, ACL/security xattrs
3266 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3267 rdlocks.insert(&cur->authlock);
3268 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3269 rdlocks.insert(&cur->xattrlock);
3270
3271 mdr->getattr_caps = mask;
3272 }
3273
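// An O_TRUNC open is effectively a write: xlock the filelock, wait for
// any in-flight truncate to finish, then hand off to do_open_truncate().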
3274 // O_TRUNC
3275 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3276 assert(cur->is_auth());
3277
3278 xlocks.insert(&cur->filelock);
3279 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3280 return;
3281
3282 if (!check_access(mdr, cur, MAY_WRITE))
3283 return;
3284
3285 // wait for pending truncate?
3286 const inode_t *pi = cur->get_projected_inode();
3287 if (pi->is_truncating()) {
3288 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3289 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3290 mds->locker->drop_locks(mdr.get());
3291 mdr->drop_local_auth_pins();
3292 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3293 return;
3294 }
3295
3296 do_open_truncate(mdr, cmode);
3297 return;
3298 }
3299
3300 // sync filelock if snapped.
3301 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3302 // and that data itself is flushed so that we can read the snapped data off disk.
3303 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3304 rdlocks.insert(&cur->filelock);
3305 }
3306
3307 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3308 return;
3309
3310 mask = MAY_READ;
3311 if (cmode & CEPH_FILE_MODE_WR)
3312 mask |= MAY_WRITE;
3313 if (!check_access(mdr, cur, mask))
3314 return;
3315
3316 if (cur->is_file() || cur->is_dir()) {
3317 if (mdr->snapid == CEPH_NOSNAP) {
3318 // register new cap
3319 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3320 if (cap)
3321 dout(12) << "open issued caps " << ccap_string(cap->pending())
3322 << " for " << req->get_source()
3323 << " on " << *cur << dendl;
3324 } else {
3325 int caps = ceph_caps_for_mode(cmode);
3326 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3327 << " for " << req->get_source()
3328 << " snapid " << mdr->snapid
3329 << " on " << *cur << dendl;
3330 mdr->snap_caps = caps;
3331 }
3332 }
3333
3334 // increase max_size?
3335 if (cmode & CEPH_FILE_MODE_WR)
3336 mds->locker->check_inode_max_size(cur);
3337
3338 // make sure this inode gets into the journal
3339 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3340 !cur->item_open_file.is_on_list()) {
3341 LogSegment *ls = mds->mdlog->get_current_segment();
3342 EOpen *le = new EOpen(mds->mdlog);
3343 mdlog->start_entry(le);
3344 le->add_clean_inode(cur);
3345 ls->open_files.push_back(&cur->item_open_file);
3346 mdlog->submit_entry(le);
3347 }
3348
3349 // hit pop
3350 if (cmode & CEPH_FILE_MODE_WR)
3351 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
3352 else
3353 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
3354 mdr->client_request->get_source().num());
3355
3356 CDentry *dn = 0;
3357 if (req->get_dentry_wanted()) {
3358 assert(mdr->dn[0].size());
3359 dn = mdr->dn[0].back();
3360 }
3361
3362 mdr->tracei = cur;
3363 mdr->tracedn = dn;
3364 respond_to_request(mdr, 0);
3365 }
3366
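// Journal-commit continuation for open-with-create: once the EUpdate is
// durable we pop the projected linkage, dirty the new inode, apply the
// request, and only then reply to the client.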
3367 class C_MDS_openc_finish : public ServerLogContext {
3368 CDentry *dn;
3369 CInode *newi;
3370 snapid_t follows;
3371 public:
3372 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
3373 ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
3374 void finish(int r) override {
3375 assert(r == 0);
3376
3377 dn->pop_projected_linkage();
3378
3379 // dirty inode, dn, dir
3380 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3381 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3382 newi->_mark_dirty_parent(mdr->ls, true);
3383
3384 mdr->apply();
3385
3386 get_mds()->locker->share_inode_max_size(newi);
3387
3388 MDRequestRef null_ref;
3389 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3390
3391 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
3392
3393 server->respond_to_request(mdr, 0);
3394
3395 assert(g_conf->mds_kill_openc_at != 1);
3396 }
3397 };
3398
3399 /* This function takes responsibility for the passed mdr*/
3400 void Server::handle_client_openc(MDRequestRef& mdr)
3401 {
3402 MClientRequest *req = mdr->client_request;
3403 client_t client = mdr->get_client();
3404
3405 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
3406
3407 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
3408 if (cmode < 0) {
3409 respond_to_request(mdr, -EINVAL);
3410 return;
3411 }
3412
3413 if (!(req->head.args.open.flags & CEPH_O_EXCL)) {
3414 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
3415 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
3416 if (r > 0) return;
3417 if (r == 0) {
3418 // it existed.
3419 handle_client_open(mdr);
3420 return;
3421 }
3422 if (r < 0 && r != -ENOENT) {
3423 if (r == -ESTALE) {
3424 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3425 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
3426 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
3427 } else {
3428 dout(10) << "FAIL on error " << r << dendl;
3429 respond_to_request(mdr, r);
3430 }
3431 return;
3432 }
3433 // r == -ENOENT
3434 }
3435
3436 bool excl = (req->head.args.open.flags & CEPH_O_EXCL);
3437 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3438 file_layout_t *dir_layout = NULL;
3439 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
3440 !excl, false, false, &dir_layout);
3441 if (!dn) return;
3442 if (mdr->snapid != CEPH_NOSNAP) {
3443 respond_to_request(mdr, -EROFS);
3444 return;
3445 }
3446 // set layout
3447 file_layout_t layout;
3448 if (dir_layout)
3449 layout = *dir_layout;
3450 else
3451 layout = mdcache->default_file_layout;
3452
3453 // What kind of client caps are required to complete this operation
3454 uint64_t access = MAY_WRITE;
3455
3456 const auto default_layout = layout;
3457
3458 // fill in any special params from client
3459 if (req->head.args.open.stripe_unit)
3460 layout.stripe_unit = req->head.args.open.stripe_unit;
3461 if (req->head.args.open.stripe_count)
3462 layout.stripe_count = req->head.args.open.stripe_count;
3463 if (req->head.args.open.object_size)
3464 layout.object_size = req->head.args.open.object_size;
3465 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
3466 (__s32)req->head.args.open.pool >= 0) {
3467 layout.pool_id = req->head.args.open.pool;
3468
3469 // make sure we have as new a map as the client
3470 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
3471 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
3472 return;
3473 }
3474 }
3475
3476 // If client doesn't have capability to modify layout pools, then
3477 // only permit this request if the requested pool matches what the
3478 // file would have inherited anyway from its parent.
3479 if (default_layout != layout) {
3480 access |= MAY_SET_VXATTR;
3481 }
3482
3483 if (!layout.is_valid()) {
3484 dout(10) << " invalid initial file layout" << dendl;
3485 respond_to_request(mdr, -EINVAL);
3486 return;
3487 }
3488 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
3489 dout(10) << " invalid data pool " << layout.pool_id << dendl;
3490 respond_to_request(mdr, -EINVAL);
3491 return;
3492 }
3493
3494 CDir *dir = dn->get_dir();
3495 CInode *diri = dir->get_inode();
3496 rdlocks.insert(&diri->authlock);
3497 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3498 return;
3499
3500 if (!check_access(mdr, diri, access))
3501 return;
3502
3503 if (!check_fragment_space(mdr, dir))
3504 return;
3505
3506 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3507
3508 if (!dnl->is_null()) {
3509 // it existed.
3510 assert(req->head.args.open.flags & CEPH_O_EXCL);
3511 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
3512 mdr->tracei = dnl->get_inode();
3513 mdr->tracedn = dn;
3514 respond_to_request(mdr, -EEXIST);
3515 return;
3516 }
3517
3518 // created null dn.
3519
3520 // create inode.
3521 SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3522 snapid_t follows = realm->get_newest_seq();
3523
3524 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
3525 req->head.args.open.mode | S_IFREG, &layout);
3526 assert(in);
3527
3528 // it's a file.
3529 dn->push_projected_linkage(in);
3530
3531 in->inode.version = dn->pre_dirty();
3532 if (layout.pool_id != mdcache->default_file_layout.pool_id)
3533 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
3534 in->inode.update_backtrace();
3535 if (cmode & CEPH_FILE_MODE_WR) {
3536 in->inode.client_ranges[client].range.first = 0;
3537 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
3538 in->inode.client_ranges[client].follows = follows;
3539 }
3540 in->inode.rstat.rfiles = 1;
3541
3542 assert(dn->first == follows+1);
3543 in->first = dn->first;
3544
3545 // prepare finisher
3546 mdr->ls = mdlog->get_current_segment();
3547 EUpdate *le = new EUpdate(mdlog, "openc");
3548 mdlog->start_entry(le);
3549 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
3550 journal_allocated_inos(mdr, &le->metablob);
3551 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
3552 le->metablob.add_primary_dentry(dn, in, true, true, true);
3553
3554 // do the open
3555 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
3556 in->authlock.set_state(LOCK_EXCL);
3557 in->xattrlock.set_state(LOCK_EXCL);
3558
3559 // make sure this inode gets into the journal
3560 le->metablob.add_opened_ino(in->ino());
3561 LogSegment *ls = mds->mdlog->get_current_segment();
3562 ls->open_files.push_back(&in->item_open_file);
3563
3564 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
3565
3566 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
3567 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
3568 // add the file-created flag to the reply, since the client supports the feature
3569 ::encode(in->inode.ino, mdr->reply_extra_bl);
3570 }
3571
3572 journal_and_reply(mdr, in, dn, le, fin);
3573
3574 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3575 // have overshot the split size (multiple opencs in flight), so here is
3576 // an early chance to split the dir if this openc makes it oversized.
3577 mds->balancer->maybe_fragment(dir, false);
3578 }
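
/*
 * Client-visible behaviour of the traverse above, roughly: O_CREAT without
 * O_EXCL falls back to a plain open when the path already exists, while
 * O_CREAT|O_EXCL on an existing dentry gets -EEXIST. From a client this is
 * just POSIX (path hypothetical):
 *
 *   #include <fcntl.h>
 *   #include <errno.h>
 *
 *   int fd = open("/mnt/cephfs/f", O_CREAT | O_EXCL | O_WRONLY, 0644);
 *   if (fd < 0 && errno == EEXIST) {
 *     // another client won the create race; the MDS replied -EEXIST
 *     // from the O_EXCL branch in handle_client_openc()
 *   }
 */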
3579
3580
3581
3582 void Server::handle_client_readdir(MDRequestRef& mdr)
3583 {
3584 MClientRequest *req = mdr->client_request;
3585 client_t client = req->get_source().num();
3586 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3587 CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
3588 if (!diri) return;
3589
3590 // it's a directory, right?
3591 if (!diri->is_dir()) {
3592 // not a dir
3593 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
3594 respond_to_request(mdr, -ENOTDIR);
3595 return;
3596 }
3597
3598 rdlocks.insert(&diri->filelock);
3599 rdlocks.insert(&diri->dirfragtreelock);
3600
3601 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3602 return;
3603
3604 if (!check_access(mdr, diri, MAY_READ))
3605 return;
3606
3607 // which frag?
3608 frag_t fg = (__u32)req->head.args.readdir.frag;
3609 unsigned req_flags = (__u32)req->head.args.readdir.flags;
3610 string offset_str = req->get_path2();
3611
3612 __u32 offset_hash = 0;
3613 if (!offset_str.empty())
3614 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
3615 else
3616 offset_hash = (__u32)req->head.args.readdir.offset_hash;
3617
3618 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
3619 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
3620
3621 // does the frag exist?
3622 if (diri->dirfragtree[fg.value()] != fg) {
3623 frag_t newfg;
3624 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3625 if (fg.contains((unsigned)offset_hash)) {
3626 newfg = diri->dirfragtree[offset_hash];
3627 } else {
3628 // client actually wants next frag
3629 newfg = diri->dirfragtree[fg.value()];
3630 }
3631 } else {
3632 offset_str.clear();
3633 newfg = diri->dirfragtree[fg.value()];
3634 }
3635 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
3636 fg = newfg;
3637 }
3638
3639 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
3640 if (!dir) return;
3641
3642 // ok!
3643 dout(10) << "handle_client_readdir on " << *dir << dendl;
3644 assert(dir->is_auth());
3645
3646 if (!dir->is_complete()) {
3647 if (dir->is_frozen()) {
3648 dout(7) << "dir is frozen " << *dir << dendl;
3649 mds->locker->drop_locks(mdr.get());
3650 mdr->drop_local_auth_pins();
3651 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3652 return;
3653 }
3654 // fetch
3655 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
3656 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3657 return;
3658 }
3659
3660 #ifdef MDS_VERIFY_FRAGSTAT
3661 dir->verify_fragstat();
3662 #endif
3663
3664 utime_t now = ceph_clock_now();
3665 mdr->set_mds_stamp(now);
3666
3667 snapid_t snapid = mdr->snapid;
3668 dout(10) << "snapid " << snapid << dendl;
3669
3670 SnapRealm *realm = diri->find_snaprealm();
3671
3672 unsigned max = req->head.args.readdir.max_entries;
3673 if (!max)
3674 max = dir->get_num_any(); // whatever, something big.
3675 unsigned max_bytes = req->head.args.readdir.max_bytes;
3676 if (!max_bytes)
3677 // make sure at least one item can be encoded
3678 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
3679
3680 // start final blob
3681 bufferlist dirbl;
3682 dir->encode_dirstat(dirbl, mds->get_nodeid());
3683
3684 // count bytes available.
3685 // this isn't perfect, but we should capture the main variable/unbounded size items!
3686 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
3687 int bytes_left = max_bytes - front_bytes;
3688 bytes_left -= realm->get_snap_trace().length();
3689
3690 // build dir contents
3691 bufferlist dnbl;
3692 __u32 numfiles = 0;
3693 bool start = !offset_hash && offset_str.empty();
3694 bool end = (dir->begin() == dir->end());
3695 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3696 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
3697 for (CDir::map_t::iterator it = start ? dir->begin() : dir->lower_bound(skip_key);
3698 !end && numfiles < max;
3699 end = (it == dir->end())) {
3700 CDentry *dn = it->second;
3701 ++it;
3702
3703 if (dn->state_test(CDentry::STATE_PURGING))
3704 continue;
3705
3706 bool dnp = dn->use_projected(client, mdr);
3707 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
3708
3709 if (dnl->is_null())
3710 continue;
3711
3712 if (dn->last < snapid || dn->first > snapid) {
3713 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
3714 continue;
3715 }
3716
3717 if (!start) {
3718 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
3719 if (!(offset_key < dn->key()))
3720 continue;
3721 }
3722
3723 CInode *in = dnl->get_inode();
3724
3725 if (in && in->ino() == CEPH_INO_CEPH)
3726 continue;
3727
3728 // remote link?
3729 // better for the MDS to do the work, if we think the client will stat any of these files.
3730 if (dnl->is_remote() && !in) {
3731 in = mdcache->get_inode(dnl->get_remote_ino());
3732 if (in) {
3733 dn->link_remote(dnl, in);
3734 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
3735 dout(10) << "skipping bad remote ino on " << *dn << dendl;
3736 continue;
3737 } else {
3738 // touch everything I _do_ have
3739 for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
3740 if (!p->second->get_linkage()->is_null())
3741 mdcache->lru.lru_touch(p->second);
3742
3743 // already issued caps and leases, reply immediately.
3744 if (dnbl.length() > 0) {
3745 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
3746 dout(10) << " open remote dentry after caps were issued, stopping at "
3747 << dnbl.length() << " < " << bytes_left << dendl;
3748 break;
3749 }
3750
3751 mds->locker->drop_locks(mdr.get());
3752 mdr->drop_local_auth_pins();
3753 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
3754 return;
3755 }
3756 }
3757 assert(in);
3758
3759 if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
3760 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
3761 break;
3762 }
3763
3764 unsigned start_len = dnbl.length();
3765
3766 // dentry
3767 dout(12) << "including dn " << *dn << dendl;
3768 ::encode(dn->name, dnbl);
3769 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
3770
3771 // inode
3772 dout(12) << "including inode " << *in << dendl;
3773 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
3774 if (r < 0) {
3775 // chop off dn->name, lease
3776 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
3777 bufferlist keep;
3778 keep.substr_of(dnbl, 0, start_len);
3779 dnbl.swap(keep);
3780 break;
3781 }
3782 assert(r >= 0);
3783 numfiles++;
3784
3785 // touch dn
3786 mdcache->lru.lru_touch(dn);
3787 }
3788
3789 __u16 flags = 0;
3790 if (end) {
3791 flags = CEPH_READDIR_FRAG_END;
3792 if (start)
3793 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
3794 }
3795 // clients may only understand the END and COMPLETE flags?
3796 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3797 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
3798 }
3799
3800 // finish final blob
3801 ::encode(numfiles, dirbl);
3802 ::encode(flags, dirbl);
3803 dirbl.claim_append(dnbl);
3804
3805 // yay, reply
3806 dout(10) << "reply to " << *req << " readdir num=" << numfiles
3807 << " bytes=" << dirbl.length()
3808 << " start=" << (int)start
3809 << " end=" << (int)end
3810 << dendl;
3811 mdr->reply_extra_bl = dirbl;
3812
3813 // bump popularity. NOTE: this doesn't quite capture it.
3814 mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);
3815
3816 // reply
3817 mdr->tracei = diri;
3818 respond_to_request(mdr, 0);
3819 }
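
/*
 * A rough sketch of the reply-size budgeting used above (container and
 * sizes hypothetical): a fixed header is charged first, entries are
 * appended while they fit, and a truncated listing is resumed by the
 * client from the last returned name (or its hash, in hash-order mode).
 *
 *   size_t budget = max_bytes - front_bytes;
 *   for (const auto& entry : entries) {
 *     size_t need = entry.name.size() + entry.encoded_inode_size;
 *     if (need > budget)
 *       break;              // client re-requests, passing offset_str or
 *                           // offset_hash to skip what it already has
 *     budget -= need;
 *   }
 */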
3820
3821
3822
3823 // ===============================================================================
3824 // INODE UPDATES
3825
3826
3827 /*
3828 * finisher for basic inode updates
3829 */
3830 class C_MDS_inode_update_finish : public ServerLogContext {
3831 CInode *in;
3832 bool truncating_smaller, changed_ranges;
3833 public:
3834 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
3835 bool sm=false, bool cr=false) :
3836 ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
3837 void finish(int r) override {
3838 assert(r == 0);
3839
3840 // apply
3841 in->pop_and_dirty_projected_inode(mdr->ls);
3842 mdr->apply();
3843
3844 // notify any clients
3845 if (truncating_smaller && in->inode.is_truncating()) {
3846 get_mds()->locker->issue_truncate(in);
3847 get_mds()->mdcache->truncate_inode(in, mdr->ls);
3848 }
3849
3850 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
3851
3852 server->respond_to_request(mdr, 0);
3853
3854 if (changed_ranges)
3855 get_mds()->locker->share_inode_max_size(in);
3856 }
3857 };
3858
3859 void Server::handle_client_file_setlock(MDRequestRef& mdr)
3860 {
3861 MClientRequest *req = mdr->client_request;
3862 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3863
3864 // get the inode to operate on, and set up any locks needed for that
3865 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3866 if (!cur)
3867 return;
3868
3869 xlocks.insert(&cur->flocklock);
3870 /* acquire_locks will return true if it gets the locks. If it fails,
3871 it will redeliver this request at a later date, so drop the request.
3872 */
3873 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3874 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3875 return;
3876 }
3877
3878 // copy the lock change into a ceph_filelock so we can store/apply it
3879 ceph_filelock set_lock;
3880 set_lock.start = req->head.args.filelock_change.start;
3881 set_lock.length = req->head.args.filelock_change.length;
3882 set_lock.client = req->get_orig_source().num();
3883 set_lock.owner = req->head.args.filelock_change.owner;
3884 set_lock.pid = req->head.args.filelock_change.pid;
3885 set_lock.type = req->head.args.filelock_change.type;
3886 bool will_wait = req->head.args.filelock_change.wait;
3887
3888 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3889
3890 ceph_lock_state_t *lock_state = NULL;
3891 bool interrupt = false;
3892
3893 // get the appropriate lock state
3894 switch (req->head.args.filelock_change.rule) {
3895 case CEPH_LOCK_FLOCK_INTR:
3896 interrupt = true;
3897 // fall-thru
3898 case CEPH_LOCK_FLOCK:
3899 lock_state = cur->get_flock_lock_state();
3900 break;
3901
3902 case CEPH_LOCK_FCNTL_INTR:
3903 interrupt = true;
3904 // fall-thru
3905 case CEPH_LOCK_FCNTL:
3906 lock_state = cur->get_fcntl_lock_state();
3907 break;
3908
3909 default:
3910 dout(10) << "got unknown lock type " << set_lock.type
3911 << ", dropping request!" << dendl;
3912 respond_to_request(mdr, -EOPNOTSUPP);
3913 return;
3914 }
3915
3916 dout(10) << " state prior to lock change: " << *lock_state << dendl;
3917 if (CEPH_LOCK_UNLOCK == set_lock.type) {
3918 list<ceph_filelock> activated_locks;
3919 list<MDSInternalContextBase*> waiters;
3920 if (lock_state->is_waiting(set_lock)) {
3921 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
3922 lock_state->remove_waiting(set_lock);
3923 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3924 } else if (!interrupt) {
3925 dout(10) << " unlock attempt on " << set_lock << dendl;
3926 lock_state->remove_lock(set_lock, activated_locks);
3927 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3928 }
3929 mds->queue_waiters(waiters);
3930
3931 respond_to_request(mdr, 0);
3932 } else {
3933 dout(10) << " lock attempt on " << set_lock << dendl;
3934 bool deadlock = false;
3935 if (mdr->more()->flock_was_waiting &&
3936 !lock_state->is_waiting(set_lock)) {
3937 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
3938 respond_to_request(mdr, -EINTR);
3939 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
3940 dout(10) << " it failed on this attempt" << dendl;
3941 // couldn't set lock right now
3942 if (deadlock) {
3943 respond_to_request(mdr, -EDEADLK);
3944 } else if (!will_wait) {
3945 respond_to_request(mdr, -EWOULDBLOCK);
3946 } else {
3947 dout(10) << " added to waiting list" << dendl;
3948 assert(lock_state->is_waiting(set_lock));
3949 mdr->more()->flock_was_waiting = true;
3950 mds->locker->drop_locks(mdr.get());
3951 mdr->drop_local_auth_pins();
3952 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
3953 }
3954 } else
3955 respond_to_request(mdr, 0);
3956 }
3957 dout(10) << " state after lock change: " << *lock_state << dendl;
3958 }
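
/*
 * Sketch of the conflict rule that ceph_lock_state_t implements for the
 * handler above (simplified; the real code also splits ranges, tracks
 * owners, and detects deadlock): two byte ranges conflict when they
 * overlap and at least one side wants an exclusive lock. length == 0
 * means "through EOF", as in fcntl(2).
 *
 *   struct range_sketch { uint64_t start, length; int type; };  // hypothetical
 *
 *   static bool overlap(const range_sketch& a, const range_sketch& b) {
 *     uint64_t a_end = a.length ? a.start + a.length : UINT64_MAX;
 *     uint64_t b_end = b.length ? b.start + b.length : UINT64_MAX;
 *     return a.start < b_end && b.start < a_end;
 *   }
 *
 *   static bool conflicts(const range_sketch& a, const range_sketch& b) {
 *     return overlap(a, b) &&
 *            (a.type == CEPH_LOCK_EXCL || b.type == CEPH_LOCK_EXCL);
 *   }
 */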
3959
3960 void Server::handle_client_file_readlock(MDRequestRef& mdr)
3961 {
3962 MClientRequest *req = mdr->client_request;
3963 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3964
3965 // get the inode to operate on, and set up any locks needed for that
3966 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3967 if (!cur)
3968 return;
3969
3970 /* acquire_locks will return true if it gets the locks. If it fails,
3971 it will redeliver this request at a later date, so drop the request.
3972 */
3973 rdlocks.insert(&cur->flocklock);
3974 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3975 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
3976 return;
3977 }
3978
3979 // copy the lock change into a ceph_filelock so we can store/apply it
3980 ceph_filelock checking_lock;
3981 checking_lock.start = req->head.args.filelock_change.start;
3982 checking_lock.length = req->head.args.filelock_change.length;
3983 checking_lock.client = req->get_orig_source().num();
3984 checking_lock.owner = req->head.args.filelock_change.owner;
3985 checking_lock.pid = req->head.args.filelock_change.pid;
3986 checking_lock.type = req->head.args.filelock_change.type;
3987
3988 // get the appropriate lock state
3989 ceph_lock_state_t *lock_state = NULL;
3990 switch (req->head.args.filelock_change.rule) {
3991 case CEPH_LOCK_FLOCK:
3992 lock_state = cur->get_flock_lock_state();
3993 break;
3994
3995 case CEPH_LOCK_FCNTL:
3996 lock_state = cur->get_fcntl_lock_state();
3997 break;
3998
3999 default:
4000 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4001 respond_to_request(mdr, -EINVAL);
4002 return;
4003 }
4004 lock_state->look_for_lock(checking_lock);
4005
4006 bufferlist lock_bl;
4007 ::encode(checking_lock, lock_bl);
4008
4009 mdr->reply_extra_bl = lock_bl;
4010 respond_to_request(mdr, 0);
4011 }
4012
4013 void Server::handle_client_setattr(MDRequestRef& mdr)
4014 {
4015 MClientRequest *req = mdr->client_request;
4016 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4017 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4018 if (!cur) return;
4019
4020 if (mdr->snapid != CEPH_NOSNAP) {
4021 respond_to_request(mdr, -EROFS);
4022 return;
4023 }
4024 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4025 respond_to_request(mdr, -EPERM);
4026 return;
4027 }
4028
4029 __u32 mask = req->head.args.setattr.mask;
4030 __u32 access_mask = MAY_WRITE;
4031
4032 // xlock inode
4033 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4034 xlocks.insert(&cur->authlock);
4035 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4036 xlocks.insert(&cur->filelock);
4037 if (mask & CEPH_SETATTR_CTIME)
4038 wrlocks.insert(&cur->versionlock);
4039
4040 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4041 return;
4042
4043 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4044 access_mask |= MAY_CHOWN;
4045
4046 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4047 access_mask |= MAY_CHGRP;
4048
4049 if (!check_access(mdr, cur, access_mask))
4050 return;
4051
4052 // trunc from bigger -> smaller?
4053 inode_t *pi = cur->get_projected_inode();
4054
4055 uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);
4056
4057 // ENOSPC on growing file while full, but allow shrinks
4058 if (is_full && req->head.args.setattr.size > old_size) {
4059 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4060 respond_to_request(mdr, -ENOSPC);
4061 return;
4062 }
4063
4064 bool truncating_smaller = false;
4065 if (mask & CEPH_SETATTR_SIZE) {
4066 truncating_smaller = req->head.args.setattr.size < old_size;
4067 if (truncating_smaller && pi->is_truncating()) {
4068 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4069 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4070 mds->locker->drop_locks(mdr.get());
4071 mdr->drop_local_auth_pins();
4072 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4073 return;
4074 }
4075 }
4076
4077 bool changed_ranges = false;
4078
4079 // project update
4080 mdr->ls = mdlog->get_current_segment();
4081 EUpdate *le = new EUpdate(mdlog, "setattr");
4082 mdlog->start_entry(le);
4083
4084 pi = cur->project_inode();
4085
4086 if (mask & CEPH_SETATTR_UID)
4087 pi->uid = req->head.args.setattr.uid;
4088 if (mask & CEPH_SETATTR_GID)
4089 pi->gid = req->head.args.setattr.gid;
4090
4091 if (mask & CEPH_SETATTR_MODE)
4092 pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
4093 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4094 S_ISREG(pi->mode)) {
4095 pi->mode &= ~S_ISUID;
4096 if ((pi->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
4097 pi->mode &= ~S_ISGID;
4098 }
4099
4100 if (mask & CEPH_SETATTR_MTIME)
4101 pi->mtime = req->head.args.setattr.mtime;
4102 if (mask & CEPH_SETATTR_ATIME)
4103 pi->atime = req->head.args.setattr.atime;
4104 if (mask & CEPH_SETATTR_BTIME)
4105 pi->btime = req->head.args.setattr.btime;
4106 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4107 pi->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4108 if (mask & CEPH_SETATTR_SIZE) {
4109 if (truncating_smaller) {
4110 pi->truncate(old_size, req->head.args.setattr.size);
4111 le->metablob.add_truncate_start(cur->ino());
4112 } else {
4113 pi->size = req->head.args.setattr.size;
4114 pi->rstat.rbytes = pi->size;
4115 }
4116 pi->mtime = mdr->get_op_stamp();
4117
4118 // adjust client's max_size?
4119 map<client_t,client_writeable_range_t> new_ranges;
4120 bool max_increased = false;
4121 mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
4122 if (pi->client_ranges != new_ranges) {
4123 dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
4124 pi->client_ranges = new_ranges;
4125 changed_ranges = true;
4126 }
4127 }
4128
4129 pi->version = cur->pre_dirty();
4130 pi->ctime = mdr->get_op_stamp();
4131 pi->change_attr++;
4132
4133 // log + wait
4134 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4135 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4136 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4137
4138 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4139 truncating_smaller, changed_ranges));
4140
4141 // flush immediately if there are readers/writers waiting
4142 if (xlocks.count(&cur->filelock) &&
4143 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4144 mds->mdlog->flush();
4145 }
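
/*
 * The shrinking-truncate path above is deliberately two-phase; a rough
 * timeline (descriptive sketch, not new API):
 *
 *   1. add_truncate_start() goes into the EUpdate and pi->truncate()
 *      marks the inode as truncating; the client is answered once the
 *      event commits.
 *   2. The finisher then calls mdcache->truncate_inode(), which removes
 *      the now-unreferenced tail objects from RADOS asynchronously.
 *   3. A matching truncate-finish event is journalled only when that
 *      completes, so a crash in between replays the object removal.
 *
 * That pending state is also why a second shrinking setattr waits on
 * CInode::WAIT_TRUNC above instead of stacking a second truncate.
 */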
4146
4147 /* Takes responsibility for mdr */
4148 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4149 {
4150 CInode *in = mdr->in[0];
4151 client_t client = mdr->get_client();
4152 assert(in);
4153
4154 dout(10) << "do_open_truncate " << *in << dendl;
4155
4156 SnapRealm *realm = in->find_snaprealm();
4157 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4158
4159 mdr->ls = mdlog->get_current_segment();
4160 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4161 mdlog->start_entry(le);
4162
4163 // prepare
4164 inode_t *pi = in->project_inode();
4165 pi->version = in->pre_dirty();
4166 pi->mtime = pi->ctime = mdr->get_op_stamp();
4167 pi->change_attr++;
4168
4169 uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
4170 if (old_size > 0) {
4171 pi->truncate(old_size, 0);
4172 le->metablob.add_truncate_start(in->ino());
4173 }
4174
4175 bool changed_ranges = false;
4176 if (cmode & CEPH_FILE_MODE_WR) {
4177 pi->client_ranges[client].range.first = 0;
4178 pi->client_ranges[client].range.last = pi->get_layout_size_increment();
4179 pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
4180 changed_ranges = true;
4181 }
4182
4183 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4184
4185 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4186 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4187
4188 // make sure ino gets into the journal
4189 le->metablob.add_opened_ino(in->ino());
4190 LogSegment *ls = mds->mdlog->get_current_segment();
4191 ls->open_files.push_back(&in->item_open_file);
4192
4193 mdr->o_trunc = true;
4194
4195 CDentry *dn = 0;
4196 if (mdr->client_request->get_dentry_wanted()) {
4197 assert(mdr->dn[0].size());
4198 dn = mdr->dn[0].back();
4199 }
4200
4201 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4202 changed_ranges));
4203 // Although the `open` part can give an early reply, the truncation won't
4204 // happen until our EUpdate is persistent; to give the client a prompt
4205 // response we must also flush that event.
4206 mdlog->flush();
4207 }
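
/*
 * Usage note: this path services an open of an existing file with O_TRUNC,
 * e.g. (plain POSIX, path hypothetical)
 *
 *   int fd = open("/mnt/cephfs/f", O_WRONLY | O_TRUNC);
 *
 * The early reply covers only the open; the size change itself is not
 * applied until the EUpdate commits, hence the unconditional mdlog->flush()
 * above to keep the client from stalling on a lazily flushed journal.
 */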
4208
4209
4210 /* This function cleans up the passed mdr */
4211 void Server::handle_client_setlayout(MDRequestRef& mdr)
4212 {
4213 MClientRequest *req = mdr->client_request;
4214 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4215 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4216 if (!cur) return;
4217
4218 if (mdr->snapid != CEPH_NOSNAP) {
4219 respond_to_request(mdr, -EROFS);
4220 return;
4221 }
4222 if (!cur->is_file()) {
4223 respond_to_request(mdr, -EINVAL);
4224 return;
4225 }
4226 if (cur->get_projected_inode()->size ||
4227 cur->get_projected_inode()->truncate_seq > 1) {
4228 respond_to_request(mdr, -ENOTEMPTY);
4229 return;
4230 }
4231
4232 // validate layout
4233 file_layout_t layout = cur->get_projected_inode()->layout;
4234 // save existing layout for later
4235 const auto old_layout = layout;
4236
4237 int access = MAY_WRITE;
4238
4239 if (req->head.args.setlayout.layout.fl_object_size > 0)
4240 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4241 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4242 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4243 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4244 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4245 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4246 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4247
4248 // make sure we have as new a map as the client
4249 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4250 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4251 return;
4252 }
4253 }
4254
4255 // Don't permit layout modifications without 'p' caps
4256 if (layout != old_layout) {
4257 access |= MAY_SET_VXATTR;
4258 }
4259
4260 if (!layout.is_valid()) {
4261 dout(10) << "bad layout" << dendl;
4262 respond_to_request(mdr, -EINVAL);
4263 return;
4264 }
4265 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4266 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4267 respond_to_request(mdr, -EINVAL);
4268 return;
4269 }
4270
4271 xlocks.insert(&cur->filelock);
4272 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4273 return;
4274
4275 if (!check_access(mdr, cur, access))
4276 return;
4277
4278 // project update
4279 inode_t *pi = cur->project_inode();
4280 pi->layout = layout;
4281 // add the old pool to the inode
4282 pi->add_old_pool(old_layout.pool_id);
4283 pi->version = cur->pre_dirty();
4284 pi->ctime = mdr->get_op_stamp();
4285 pi->change_attr++;
4286
4287 // log + wait
4288 mdr->ls = mdlog->get_current_segment();
4289 EUpdate *le = new EUpdate(mdlog, "setlayout");
4290 mdlog->start_entry(le);
4291 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4292 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4293 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4294
4295 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4296 }
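
/*
 * Sketch of the shape constraints behind layout.is_valid() (a hypothetical
 * simplification; the authoritative checks live in file_layout_t): every
 * field must be non-zero and an object must hold a whole number of stripe
 * units.
 *
 *   static bool layout_valid_sketch(const file_layout_t& l) {
 *     return l.stripe_unit > 0 && l.stripe_count > 0 &&
 *            l.object_size >= l.stripe_unit &&
 *            l.object_size % l.stripe_unit == 0;
 *   }
 *
 * The separate is_data_pool() check guards against writing file data into
 * a pool the MDSMap does not list as a data pool for this filesystem.
 */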
4297
4298 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4299 {
4300 MClientRequest *req = mdr->client_request;
4301 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4302 file_layout_t *dir_layout = NULL;
4303 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4304 if (!cur) return;
4305
4306 if (mdr->snapid != CEPH_NOSNAP) {
4307 respond_to_request(mdr, -EROFS);
4308 return;
4309 }
4310
4311 if (!cur->is_dir()) {
4312 respond_to_request(mdr, -ENOTDIR);
4313 return;
4314 }
4315
4316 xlocks.insert(&cur->policylock);
4317 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4318 return;
4319
4320 // validate layout
4321 const inode_t *old_pi = cur->get_projected_inode();
4322 file_layout_t layout;
4323 if (old_pi->has_layout())
4324 layout = old_pi->layout;
4325 else if (dir_layout)
4326 layout = *dir_layout;
4327 else
4328 layout = mdcache->default_file_layout;
4329
4330 // Level of access required to complete
4331 int access = MAY_WRITE;
4332
4333 const auto old_layout = layout;
4334
4335 if (req->head.args.setlayout.layout.fl_object_size > 0)
4336 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4337 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4338 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4339 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4340 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4341 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4342 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4343 // make sure we have as new a map as the client
4344 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4345 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4346 return;
4347 }
4348 }
4349
4350 if (layout != old_layout) {
4351 access |= MAY_SET_VXATTR;
4352 }
4353
4354 if (!layout.is_valid()) {
4355 dout(10) << "bad layout" << dendl;
4356 respond_to_request(mdr, -EINVAL);
4357 return;
4358 }
4359 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4360 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4361 respond_to_request(mdr, -EINVAL);
4362 return;
4363 }
4364
4365 if (!check_access(mdr, cur, access))
4366 return;
4367
4368 inode_t *pi = cur->project_inode();
4369 pi->layout = layout;
4370 pi->version = cur->pre_dirty();
4371
4372 // log + wait
4373 mdr->ls = mdlog->get_current_segment();
4374 EUpdate *le = new EUpdate(mdlog, "setlayout");
4375 mdlog->start_entry(le);
4376 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4377 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4378 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4379
4380 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4381 }
4382
4383 // XATTRS
4384
4385 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4386 file_layout_t *layout, bool validate)
4387 {
4388 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4389 try {
4390 if (name == "layout") {
4391 string::iterator begin = value.begin();
4392 string::iterator end = value.end();
4393 keys_and_values<string::iterator> p; // create instance of parser
4394 std::map<string, string> m; // map to receive results
4395 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4396 return -EINVAL;
4397 }
4398 string left(begin, end);
4399 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4400 if (begin != end)
4401 return -EINVAL;
4402 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4403 // Skip validation on each attr; we do it once at the end (to avoid
4404 // rejecting intermediate states if the overall result is ok)
4405 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4406 osdmap, layout, false);
4407 if (r < 0)
4408 return r;
4409 }
4410 } else if (name == "layout.object_size") {
4411 layout->object_size = boost::lexical_cast<unsigned>(value);
4412 } else if (name == "layout.stripe_unit") {
4413 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4414 } else if (name == "layout.stripe_count") {
4415 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4416 } else if (name == "layout.pool") {
4417 try {
4418 layout->pool_id = boost::lexical_cast<unsigned>(value);
4419 } catch (boost::bad_lexical_cast const&) {
4420 int64_t pool = osdmap.lookup_pg_pool_name(value);
4421 if (pool < 0) {
4422 dout(10) << " unknown pool " << value << dendl;
4423 return -ENOENT;
4424 }
4425 layout->pool_id = pool;
4426 }
4427 } else if (name == "layout.pool_namespace") {
4428 layout->pool_ns = value;
4429 } else {
4430 dout(10) << " unknown layout vxattr " << name << dendl;
4431 return -EINVAL;
4432 }
4433 } catch (boost::bad_lexical_cast const&) {
4434 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4435 return -EINVAL;
4436 }
4437
4438 if (validate && !layout->is_valid()) {
4439 dout(10) << "bad layout" << dendl;
4440 return -EINVAL;
4441 }
4442 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4443 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4444 return -EINVAL;
4445 }
4446 return 0;
4447 }
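
/*
 * These vxattrs arrive via ordinary setxattr calls; e.g. from a shell
 * (paths hypothetical, values illustrative):
 *
 *   setfattr -n ceph.file.layout.stripe_unit -v 1048576 /mnt/cephfs/file
 *   setfattr -n ceph.dir.layout -v "stripe_count=2 pool=cephfs_data" /mnt/cephfs/dir
 *
 * The combined "layout" form is split by the key/value grammar above and
 * each field is applied with validate=false, so intermediate states that
 * would be individually invalid are tolerated; the layout is then
 * validated once, as a whole, at the end.
 */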
4448
4449 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4450 {
4451 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4452 try {
4453 if (name == "quota") {
4454 string::iterator begin = value.begin();
4455 string::iterator end = value.end();
4456 keys_and_values<string::iterator> p; // create instance of parser
4457 std::map<string, string> m; // map to receive results
4458 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4459 return -EINVAL;
4460 }
4461 string left(begin, end);
4462 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4463 if (begin != end)
4464 return -EINVAL;
4465 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4466 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4467 if (r < 0)
4468 return r;
4469 }
4470 } else if (name == "quota.max_bytes") {
4471 int64_t q = boost::lexical_cast<int64_t>(value);
4472 if (q < 0)
4473 return -EINVAL;
4474 quota->max_bytes = q;
4475 } else if (name == "quota.max_files") {
4476 int64_t q = boost::lexical_cast<int64_t>(value);
4477 if (q < 0)
4478 return -EINVAL;
4479 quota->max_files = q;
4480 } else {
4481 dout(10) << " unknown quota vxattr " << name << dendl;
4482 return -EINVAL;
4483 }
4484 } catch (boost::bad_lexical_cast const&) {
4485 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4486 return -EINVAL;
4487 }
4488
4489 if (!quota->is_valid()) {
4490 dout(10) << "bad quota" << dendl;
4491 return -EINVAL;
4492 }
4493 return 0;
4494 }
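
/*
 * Usage sketch for the quota vxattrs (directory path hypothetical); a
 * value of 0 is taken to mean "no limit":
 *
 *   setfattr -n ceph.quota.max_bytes -v 100000000 /mnt/cephfs/dir
 *   setfattr -n ceph.quota.max_files -v 10000     /mnt/cephfs/dir
 *
 * Negative values are rejected during parsing above; the aggregate quota
 * is then checked once with quota_info_t::is_valid().
 */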
4495
4496 /*
4497 * Verify that the file layout attribute carried by client
4498 * is well-formatted.
4499 * Return 0 on success, otherwise this function takes
4500 * responsibility for the passed mdr.
4501 */
4502 int Server::check_layout_vxattr(MDRequestRef& mdr,
4503 string name,
4504 string value,
4505 file_layout_t *layout)
4506 {
4507 MClientRequest *req = mdr->client_request;
4508 epoch_t epoch;
4509 int r;
4510
4511 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4512 r = parse_layout_vxattr(name, value, osdmap, layout);
4513 epoch = osdmap.get_epoch();
4514 });
4515
4516 if (r == -ENOENT) {
4517
4518 // we don't have the specified pool; make sure our map
4519 // is at least as new as the client's.
4520 epoch_t req_epoch = req->get_osdmap_epoch();
4521
4522 if (req_epoch > epoch) {
4523
4524 // well, our map is older. consult mds.
4525 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4526
4527 if (!mds->objecter->wait_for_map(req_epoch, fin))
4528 return r; // wait, fin will retry this request later
4529
4530 delete fin;
4531
4532 // now we have at least as new a map as the client, try again.
4533 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4534 r = parse_layout_vxattr(name, value, osdmap, layout);
4535 epoch = osdmap.get_epoch();
4536 });
4537
4538 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4539
4540 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4541
4542 // For compatibility with clients running old code, we still need to
4543 // fetch the latest map. One day, when the COMPAT_VERSION of
4544 // MClientRequest is >= 3, this code can be removed.
4545 mdr->waited_for_osdmap = true;
4546 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4547 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4548 return r;
4549 }
4550 }
4551
4552 if (r < 0) {
4553
4554 if (r == -ENOENT)
4555 r = -EINVAL;
4556
4557 respond_to_request(mdr, r);
4558 return r;
4559 }
4560
4561 // all is well
4562 return 0;
4563 }
4564
4565 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
4566 file_layout_t *dir_layout,
4567 set<SimpleLock*> rdlocks,
4568 set<SimpleLock*> wrlocks,
4569 set<SimpleLock*> xlocks)
4570 {
4571 MClientRequest *req = mdr->client_request;
4572 string name(req->get_path2());
4573 bufferlist bl = req->get_data();
4574 string value (bl.c_str(), bl.length());
4575 dout(10) << "handle_set_vxattr " << name
4576 << " val " << value.length()
4577 << " bytes on " << *cur
4578 << dendl;
4579
4580 inode_t *pi = NULL;
4581 string rest;
4582
4583 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
4584 return;
4585 }
4586
4587 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
4588 if (!cur->is_dir()) {
4589 respond_to_request(mdr, -EINVAL);
4590 return;
4591 }
4592
4593 file_layout_t layout;
4594 if (cur->get_projected_inode()->has_layout())
4595 layout = cur->get_projected_inode()->layout;
4596 else if (dir_layout)
4597 layout = *dir_layout;
4598 else
4599 layout = mdcache->default_file_layout;
4600
4601 rest = name.substr(name.find("layout"));
4602 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4603 return;
4604
4605 xlocks.insert(&cur->policylock);
4606 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4607 return;
4608
4609 pi = cur->project_inode();
4610 pi->layout = layout;
4611 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
4612 if (!cur->is_file()) {
4613 respond_to_request(mdr, -EINVAL);
4614 return;
4615 }
4616 if (cur->get_projected_inode()->size ||
4617 cur->get_projected_inode()->truncate_seq > 1) {
4618 respond_to_request(mdr, -ENOTEMPTY);
4619 return;
4620 }
4621 file_layout_t layout = cur->get_projected_inode()->layout;
4622 rest = name.substr(name.find("layout"));
4623 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4624 return;
4625
4626 xlocks.insert(&cur->filelock);
4627 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4628 return;
4629
4630 pi = cur->project_inode();
4631 int64_t old_pool = pi->layout.pool_id;
4632 pi->add_old_pool(old_pool);
4633 pi->layout = layout;
4634 pi->ctime = mdr->get_op_stamp();
4635 } else if (name.compare(0, 10, "ceph.quota") == 0) {
4636 if (!cur->is_dir() || cur->is_root()) {
4637 respond_to_request(mdr, -EINVAL);
4638 return;
4639 }
4640
4641 quota_info_t quota = cur->get_projected_inode()->quota;
4642
4643 rest = name.substr(name.find("quota"));
4644 int r = parse_quota_vxattr(rest, value, &quota);
4645 if (r < 0) {
4646 respond_to_request(mdr, r);
4647 return;
4648 }
4649
4650 xlocks.insert(&cur->policylock);
4651 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4652 return;
4653
4654 pi = cur->project_inode();
4655 pi->quota = quota;
4656 } else if (name.find("ceph.dir.pin") == 0) {
4657 if (!cur->is_dir() || cur->is_root()) {
4658 respond_to_request(mdr, -EINVAL);
4659 return;
4660 }
4661
4662 mds_rank_t rank;
4663 try {
4664 rank = boost::lexical_cast<mds_rank_t>(value);
4665 if (rank < 0) rank = MDS_RANK_NONE;
4666 } catch (boost::bad_lexical_cast const&) {
4667 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4668 respond_to_request(mdr, -EINVAL);
4669 return;
4670 }
4671
4672 xlocks.insert(&cur->policylock);
4673 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4674 return;
4675
4676 pi = cur->project_inode();
4677 cur->set_export_pin(rank);
4678 } else {
4679 dout(10) << " unknown vxattr " << name << dendl;
4680 respond_to_request(mdr, -EINVAL);
4681 return;
4682 }
4683
4684 pi->change_attr++;
4685 pi->ctime = mdr->get_op_stamp();
4686 pi->version = cur->pre_dirty();
4687 if (cur->is_file())
4688 pi->update_backtrace();
4689
4690 // log + wait
4691 mdr->ls = mdlog->get_current_segment();
4692 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
4693 mdlog->start_entry(le);
4694 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4695 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4696 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4697
4698 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4699 return;
4700 }
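
/*
 * Example for the export-pin branch above (rank numbers hypothetical):
 *
 *   setfattr -n ceph.dir.pin -v 1  /mnt/cephfs/dir   # pin subtree to rank 1
 *   setfattr -n ceph.dir.pin -v -1 /mnt/cephfs/dir   # clear the pin
 *
 * Any negative value is normalised to MDS_RANK_NONE by the lexical_cast
 * branch, which is what clears the pin.
 */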
4701
4702 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
4703 file_layout_t *dir_layout,
4704 set<SimpleLock*> rdlocks,
4705 set<SimpleLock*> wrlocks,
4706 set<SimpleLock*> xlocks)
4707 {
4708 MClientRequest *req = mdr->client_request;
4709 string name(req->get_path2());
4710
4711 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
4712
4713 if (name == "ceph.dir.layout") {
4714 if (!cur->is_dir()) {
4715 respond_to_request(mdr, -ENODATA);
4716 return;
4717 }
4718 if (cur->is_root()) {
4719 dout(10) << "can't remove layout policy on the root directory" << dendl;
4720 respond_to_request(mdr, -EINVAL);
4721 return;
4722 }
4723
4724 if (!cur->get_projected_inode()->has_layout()) {
4725 respond_to_request(mdr, -ENODATA);
4726 return;
4727 }
4728
4729 xlocks.insert(&cur->policylock);
4730 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4731 return;
4732
4733 inode_t *pi = cur->project_inode();
4734 pi->clear_layout();
4735 pi->version = cur->pre_dirty();
4736
4737 // log + wait
4738 mdr->ls = mdlog->get_current_segment();
4739 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
4740 mdlog->start_entry(le);
4741 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4742 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4743 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4744
4745 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4746 return;
4747 } else if (name == "ceph.dir.layout.pool_namespace"
4748 || name == "ceph.file.layout.pool_namespace") {
4749 // Namespace is the only layout field that has a meaningful
4750 // null/none value (an empty string means the default layout). This is
4751 // equivalent to a setxattr with an empty string, so pass through the
4752 // empty payload of the rmxattr request to do this.
4753 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4754 return;
4755 }
4756
4757 respond_to_request(mdr, -ENODATA);
4758 }
4759
4760 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4761 CInode *in;
4762 public:
4763
4764 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4765 ServerLogContext(s, r), in(i) { }
4766 void finish(int r) override {
4767 assert(r == 0);
4768
4769 // apply
4770 in->pop_and_dirty_projected_inode(mdr->ls);
4771
4772 mdr->apply();
4773
4774 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
4775
4776 server->respond_to_request(mdr, 0);
4777 }
4778 };
4779
4780 void Server::handle_client_setxattr(MDRequestRef& mdr)
4781 {
4782 MClientRequest *req = mdr->client_request;
4783 string name(req->get_path2());
4784 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4785 CInode *cur;
4786
4787 file_layout_t *dir_layout = NULL;
4788 if (name.compare(0, 15, "ceph.dir.layout") == 0)
4789 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4790 else
4791 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4792 if (!cur)
4793 return;
4794
4795 if (mdr->snapid != CEPH_NOSNAP) {
4796 respond_to_request(mdr, -EROFS);
4797 return;
4798 }
4799
4800 int flags = req->head.args.setxattr.flags;
4801
4802 // magic ceph.* namespace?
4803 if (name.compare(0, 5, "ceph.") == 0) {
4804 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4805 return;
4806 }
4807
4808 xlocks.insert(&cur->xattrlock);
4809 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4810 return;
4811
4812 if (!check_access(mdr, cur, MAY_WRITE))
4813 return;
4814
4815 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4816 size_t len = req->get_data().length();
4817 size_t inc = len + name.length();
4818
4819 // check xattrs kv pairs size
4820 size_t cur_xattrs_size = 0;
4821 for (const auto& p : *pxattrs) {
4822 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
4823 continue;
4824 }
4825 cur_xattrs_size += p.first.length() + p.second.length();
4826 }
4827
4828 if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
4829 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4830 << cur_xattrs_size << ", inc " << inc << dendl;
4831 respond_to_request(mdr, -ENOSPC);
4832 return;
4833 }
4834
4835 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
4836 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
4837 respond_to_request(mdr, -EEXIST);
4838 return;
4839 }
4840 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
4841 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
4842 respond_to_request(mdr, -ENODATA);
4843 return;
4844 }
4845
4846 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
4847
4848 // project update
4849 map<string,bufferptr> *px = new map<string,bufferptr>;
4850 inode_t *pi = cur->project_inode(px);
4851 pi->version = cur->pre_dirty();
4852 pi->ctime = mdr->get_op_stamp();
4853 pi->change_attr++;
4854 pi->xattr_version++;
4855 px->erase(name);
4856 if (!(flags & CEPH_XATTR_REMOVE)) {
4857 (*px)[name] = buffer::create(len);
4858 if (len)
4859 req->get_data().copy(0, len, (*px)[name].c_str());
4860 }
4861
4862 // log + wait
4863 mdr->ls = mdlog->get_current_segment();
4864 EUpdate *le = new EUpdate(mdlog, "setxattr");
4865 mdlog->start_entry(le);
4866 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4867 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4868 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4869
4870 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4871 }
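
/*
 * Flag semantics matched by the checks above, from the client side (plain
 * POSIX xattr API, path hypothetical):
 *
 *   setxattr(path, "user.k", val, len, XATTR_CREATE);   // -EEXIST if present
 *   setxattr(path, "user.k", val, len, XATTR_REPLACE);  // -ENODATA if absent
 *
 * The size guard charges name + value of the incoming pair against
 * mds_max_xattr_pairs_size; with XATTR_REPLACE the pair being replaced is
 * excluded from the current total, so replacing a large value in place is
 * not spuriously rejected.
 */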
4872
4873 void Server::handle_client_removexattr(MDRequestRef& mdr)
4874 {
4875 MClientRequest *req = mdr->client_request;
4876 string name(req->get_path2());
4877 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4878 file_layout_t *dir_layout = NULL;
4879 CInode *cur;
4880 if (name == "ceph.dir.layout")
4881 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4882 else
4883 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4884 if (!cur)
4885 return;
4886
4887 if (mdr->snapid != CEPH_NOSNAP) {
4888 respond_to_request(mdr, -EROFS);
4889 return;
4890 }
4891
4892 if (name.compare(0, 5, "ceph.") == 0) {
4893 handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4894 return;
4895 }
4896
4897 xlocks.insert(&cur->xattrlock);
4898 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4899 return;
4900
4901 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4902 if (pxattrs->count(name) == 0) {
4903 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
4904 respond_to_request(mdr, -ENODATA);
4905 return;
4906 }
4907
4908 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
4909
4910 // project update
4911 map<string,bufferptr> *px = new map<string,bufferptr>;
4912 inode_t *pi = cur->project_inode(px);
4913 pi->version = cur->pre_dirty();
4914 pi->ctime = mdr->get_op_stamp();
4915 pi->change_attr++;
4916 pi->xattr_version++;
4917 px->erase(name);
4918
4919 // log + wait
4920 mdr->ls = mdlog->get_current_segment();
4921 EUpdate *le = new EUpdate(mdlog, "removexattr");
4922 mdlog->start_entry(le);
4923 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4924 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4925 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4926
4927 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4928 }
4929
4930
4931 // =================================================================
4932 // DIRECTORY and NAMESPACE OPS
4933
4934
4935 // ------------------------------------------------
4936
4937 // MKNOD
4938
4939 class C_MDS_mknod_finish : public ServerLogContext {
4940 CDentry *dn;
4941 CInode *newi;
4942 public:
4943 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4944 ServerLogContext(s, r), dn(d), newi(ni) {}
4945 void finish(int r) override {
4946 assert(r == 0);
4947
4948 // link the inode
4949 dn->pop_projected_linkage();
4950
4951 // be a bit hacky with the inode version here.. we decrement it
4952 // just to keep mark_dirty() happy. (we didn't bother projecting
4953 // a new version of the inode since it's just been created)
4954 newi->inode.version--;
4955 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
4956 newi->_mark_dirty_parent(mdr->ls, true);
4957
4958 // mkdir?
4959 if (newi->inode.is_dir()) {
4960 CDir *dir = newi->get_dirfrag(frag_t());
4961 assert(dir);
4962 dir->fnode.version--;
4963 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
4964 dir->mark_new(mdr->ls);
4965 }
4966
4967 mdr->apply();
4968
4969 MDRequestRef null_ref;
4970 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4971
4972 if (newi->inode.is_file())
4973 get_mds()->locker->share_inode_max_size(newi);
4974
4975 // hit pop
4976 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
4977
4978 // reply
4979 server->respond_to_request(mdr, 0);
4980 }
4981 };
4982
4983
4984 void Server::handle_client_mknod(MDRequestRef& mdr)
4985 {
4986 MClientRequest *req = mdr->client_request;
4987 client_t client = mdr->get_client();
4988 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4989 file_layout_t *dir_layout = NULL;
4990 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
4991 &dir_layout);
4992 if (!dn) return;
4993 if (mdr->snapid != CEPH_NOSNAP) {
4994 respond_to_request(mdr, -EROFS);
4995 return;
4996 }
4997 CInode *diri = dn->get_dir()->get_inode();
4998 rdlocks.insert(&diri->authlock);
4999 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5000 return;
5001
5002 if (!check_access(mdr, diri, MAY_WRITE))
5003 return;
5004
5005 if (!check_fragment_space(mdr, dn->get_dir()))
5006 return;
5007
5008 unsigned mode = req->head.args.mknod.mode;
5009 if ((mode & S_IFMT) == 0)
5010 mode |= S_IFREG;
5011
5012 // set layout
5013 file_layout_t layout;
5014 if (dir_layout && S_ISREG(mode))
5015 layout = *dir_layout;
5016 else
5017 layout = mdcache->default_file_layout;
5018
5019 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5020 snapid_t follows = realm->get_newest_seq();
5021 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
5022 mode, &layout);
5023 assert(newi);
5024
5025 dn->push_projected_linkage(newi);
5026
5027 newi->inode.rdev = req->head.args.mknod.rdev;
5028 newi->inode.version = dn->pre_dirty();
5029 newi->inode.rstat.rfiles = 1;
5030 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5031 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5032 newi->inode.update_backtrace();
5033
5034 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5035 // want to write to it (e.g., if they are reexporting NFS)
5036 if (S_ISREG(newi->inode.mode)) {
5037 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5038 newi->inode.client_ranges[client].range.first = 0;
5039 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5040 newi->inode.client_ranges[client].follows = follows;
5041
5042 // issue a cap on the file
5043 int cmode = CEPH_FILE_MODE_RDWR;
5044 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5045 if (cap) {
5046 cap->set_wanted(0);
5047
5048 // put locks in excl mode
5049 newi->filelock.set_state(LOCK_EXCL);
5050 newi->authlock.set_state(LOCK_EXCL);
5051 newi->xattrlock.set_state(LOCK_EXCL);
5052 }
5053 }
5054
5055 assert(dn->first == follows + 1);
5056 newi->first = dn->first;
5057
5058 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5059
5060 // prepare finisher
5061 mdr->ls = mdlog->get_current_segment();
5062 EUpdate *le = new EUpdate(mdlog, "mknod");
5063 mdlog->start_entry(le);
5064 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5065 journal_allocated_inos(mdr, &le->metablob);
5066
5067 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5068 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5069 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5070
5071 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5072 }
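
/*
 * Sketch of what the pre-granted client_range above buys (sizes and path
 * hypothetical): the writeable range [0, layout size increment] plus the
 * eagerly issued caps mean an NFS-style create-then-write sequence does
 * not need an extra MDS round trip for a max_size increase:
 *
 *   mknod("/mnt/cephfs/f", S_IFREG | 0644, 0);  // caps pre-issued here
 *   int fd = open("/mnt/cephfs/f", O_WRONLY);
 *   write(fd, buf, 4096);                       // within client_range
 */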
5073
5074
5075
5076 // MKDIR
5077 /* This function takes responsibility for the passed mdr*/
5078 void Server::handle_client_mkdir(MDRequestRef& mdr)
5079 {
5080 MClientRequest *req = mdr->client_request;
5081 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5082 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5083 if (!dn) return;
5084 if (mdr->snapid != CEPH_NOSNAP) {
5085 respond_to_request(mdr, -EROFS);
5086 return;
5087 }
5088 CDir *dir = dn->get_dir();
5089 CInode *diri = dir->get_inode();
5090 rdlocks.insert(&diri->authlock);
5091 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5092 return;
5093
5094 // mkdir check access
5095 if (!check_access(mdr, diri, MAY_WRITE))
5096 return;
5097
5098 if (!check_fragment_space(mdr, dir))
5099 return;
5100
5101 // new inode
5102 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5103 snapid_t follows = realm->get_newest_seq();
5104
5105 unsigned mode = req->head.args.mkdir.mode;
5106 mode &= ~S_IFMT;
5107 mode |= S_IFDIR;
5108 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5109 assert(newi);
5110
5111 // it's a directory.
5112 dn->push_projected_linkage(newi);
5113
5114 newi->inode.version = dn->pre_dirty();
5115 newi->inode.rstat.rsubdirs = 1;
5116 newi->inode.update_backtrace();
5117
5118 dout(12) << " follows " << follows << dendl;
5119 assert(dn->first == follows + 1);
5120 newi->first = dn->first;
5121
5122 // ...and that new dir is empty.
5123 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5124 newdir->state_set(CDir::STATE_CREATING);
5125 newdir->mark_complete();
5126 newdir->fnode.version = newdir->pre_dirty();
5127
5128 // prepare finisher
5129 mdr->ls = mdlog->get_current_segment();
5130 EUpdate *le = new EUpdate(mdlog, "mkdir");
5131 mdlog->start_entry(le);
5132 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5133 journal_allocated_inos(mdr, &le->metablob);
5134 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5135 le->metablob.add_primary_dentry(dn, newi, true, true);
5136 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5137
5138 // issue a cap on the directory
5139 int cmode = CEPH_FILE_MODE_RDWR;
5140 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5141 if (cap) {
5142 cap->set_wanted(0);
5143
5144 // put locks in excl mode
5145 newi->filelock.set_state(LOCK_EXCL);
5146 newi->authlock.set_state(LOCK_EXCL);
5147 newi->xattrlock.set_state(LOCK_EXCL);
5148 }
5149
5150 // make sure this inode gets into the journal
5151 le->metablob.add_opened_ino(newi->ino());
5152 LogSegment *ls = mds->mdlog->get_current_segment();
5153 ls->open_files.push_back(&newi->item_open_file);
5154
5155 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5156 }
5157
5158
5159 // SYMLINK
5160
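/* Handle CEPH_MDS_OP_SYMLINK: allocate a new inode with mode S_IFLNK|0777,
 * store the link target (the request's second path) in newi->symlink, and
 * journal the new dentry and inode via an EUpdate, replying on commit.
 * Like mknod/mkdir, this takes responsibility for the passed mdr. */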
5161 void Server::handle_client_symlink(MDRequestRef& mdr)
5162 {
5163 MClientRequest *req = mdr->client_request;
5164 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5165 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5166 if (!dn) return;
5167 if (mdr->snapid != CEPH_NOSNAP) {
5168 respond_to_request(mdr, -EROFS);
5169 return;
5170 }
5171 CDir *dir = dn->get_dir();
5172 CInode *diri = dir->get_inode();
5173 rdlocks.insert(&diri->authlock);
5174 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5175 return;
5176
5177 if (!check_access(mdr, diri, MAY_WRITE))
5178 return;
5179
5180 if (!check_fragment_space(mdr, dir))
5181 return;
5182
5183 unsigned mode = S_IFLNK | 0777;
5184 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5185 assert(newi);
5186
5187 // it's a symlink
5188 dn->push_projected_linkage(newi);
5189
5190 newi->symlink = req->get_path2();
5191 newi->inode.size = newi->symlink.length();
5192 newi->inode.rstat.rbytes = newi->inode.size;
5193 newi->inode.rstat.rfiles = 1;
5194 newi->inode.version = dn->pre_dirty();
5195 newi->inode.update_backtrace();
5196
5197 newi->first = dn->first;
5198
5199 // prepare finisher
5200 mdr->ls = mdlog->get_current_segment();
5201 EUpdate *le = new EUpdate(mdlog, "symlink");
5202 mdlog->start_entry(le);
5203 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5204 journal_allocated_inos(mdr, &le->metablob);
5205 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5206 le->metablob.add_primary_dentry(dn, newi, true, true);
5207
5208 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5209 }
5210
5211
5212
5213
5214
5215 // LINK
5216
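/* Handle CEPH_MDS_OP_LINK: filepath is the new dentry to create, filepath2
 * the existing target inode (directories are rejected with EINVAL). If we
 * are auth for the target, everything is journaled locally in _link_local();
 * otherwise _link_remote() runs a two-phase update with the target's auth. */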
5217 void Server::handle_client_link(MDRequestRef& mdr)
5218 {
5219 MClientRequest *req = mdr->client_request;
5220
5221 dout(7) << "handle_client_link " << req->get_filepath()
5222 << " to " << req->get_filepath2()
5223 << dendl;
5224
5225 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5226
5227 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5228 if (!dn) return;
5229 CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
5230 if (!targeti) return;
5231 if (mdr->snapid != CEPH_NOSNAP) {
5232 respond_to_request(mdr, -EROFS);
5233 return;
5234 }
5235
5236 CDir *dir = dn->get_dir();
5237 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5238 dout(7) << "target is " << *targeti << dendl;
5239 if (targeti->is_dir()) {
5240 dout(7) << "target is a dir, failing..." << dendl;
5241 respond_to_request(mdr, -EINVAL);
5242 return;
5243 }
5244
5245 xlocks.insert(&targeti->linklock);
5246
5247 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5248 return;
5249
5250 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5251 if (!check_access(mdr, targeti, MAY_WRITE))
5252 return;
5253
5254 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5255 return;
5256
5257 if (!check_fragment_space(mdr, dir))
5258 return;
5259 }
5260
5261 // go!
5262 assert(g_conf->mds_kill_link_at != 1);
5263
5264 // local or remote?
5265 if (targeti->is_auth())
5266 _link_local(mdr, dn, targeti);
5267 else
5268 _link_remote(mdr, true, dn, targeti);
5269 }
5270
5271
5272 class C_MDS_link_local_finish : public ServerLogContext {
5273 CDentry *dn;
5274 CInode *targeti;
5275 version_t dnpv;
5276 version_t tipv;
5277 public:
5278 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5279 version_t dnpv_, version_t tipv_) :
5280 ServerLogContext(s, r), dn(d), targeti(ti),
5281 dnpv(dnpv_), tipv(tipv_) { }
5282 void finish(int r) override {
5283 assert(r == 0);
5284 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5285 }
5286 };
5287
5288
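// _link_local: the new dentry and the target inode are both ours, so this
// is a single journaled update: project nlink++ and ctime on the target,
// add the new remote dentry to the EUpdate, and apply both in the finisher.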
5289 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5290 {
5291 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
5292
5293 mdr->ls = mdlog->get_current_segment();
5294
5295 // predirty NEW dentry
5296 version_t dnpv = dn->pre_dirty();
5297 version_t tipv = targeti->pre_dirty();
5298
5299 // project inode update
5300 inode_t *pi = targeti->project_inode();
5301 pi->nlink++;
5302 pi->ctime = mdr->get_op_stamp();
5303 pi->change_attr++;
5304 pi->version = tipv;
5305
5306 // log + wait
5307 EUpdate *le = new EUpdate(mdlog, "link_local");
5308 mdlog->start_entry(le);
5309 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5310 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
5311 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
5312 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5313 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
5314
5315 // do this after predirty_*, to avoid funky extra dnl arg
5316 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5317
5318 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
5319 }
5320
5321 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
5322 version_t dnpv, version_t tipv)
5323 {
5324 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
5325
5326 // link and unlock the NEW dentry
5327 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5328 if (!dnl->get_inode())
5329 dn->link_remote(dnl, targeti);
5330 dn->mark_dirty(dnpv, mdr->ls);
5331
5332 // target inode
5333 targeti->pop_and_dirty_projected_inode(mdr->ls);
5334
5335 mdr->apply();
5336
5337 MDRequestRef null_ref;
5338 mdcache->send_dentry_link(dn, null_ref);
5339
5340 // bump target popularity
5341 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5342 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5343
5344 // reply
5345 respond_to_request(mdr, 0);
5346 }
5347
5348
5349 // link / unlink remote
5350
5351 class C_MDS_link_remote_finish : public ServerLogContext {
5352 bool inc;
5353 CDentry *dn;
5354 CInode *targeti;
5355 version_t dpv;
5356 public:
5357 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5358 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5359 dpv(d->get_projected_version()) {}
5360 void finish(int r) override {
5361 assert(r == 0);
5362 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5363 }
5364 };
5365
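// _link_remote: the target inode is auth on another MDS. First send that
// rank an OP_LINKPREP/OP_UNLINKPREP slave request so it journals the nlink
// change, and return until its ack arrives; on re-dispatch with the witness
// recorded, journal our side (the dentry) and register the witness via
// add_uncommitted_master() so resolve can complete it if we fail.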
5366 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
5367 {
5368 dout(10) << "_link_remote "
5369 << (inc ? "link ":"unlink ")
5370 << *dn << " to " << *targeti << dendl;
5371
5372 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5373 mds_rank_t linkauth = targeti->authority().first;
5374 if (mdr->more()->witnessed.count(linkauth) == 0) {
5375 if (mds->is_cluster_degraded() &&
5376 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
5377 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
5378 if (mdr->more()->waiting_on_slave.empty())
5379 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
5380 return;
5381 }
5382
5383 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
5384 int op;
5385 if (inc)
5386 op = MMDSSlaveRequest::OP_LINKPREP;
5387 else
5388 op = MMDSSlaveRequest::OP_UNLINKPREP;
5389 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
5390 targeti->set_object_info(req->get_object_info());
5391 req->op_stamp = mdr->get_op_stamp();
5392 mds->send_message_mds(req, linkauth);
5393
5394 assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
5395 mdr->more()->waiting_on_slave.insert(linkauth);
5396 return;
5397 }
5398 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
5399
5400 assert(g_conf->mds_kill_link_at != 2);
5401
5402 mdr->set_mds_stamp(ceph_clock_now());
5403
5404 // add to event
5405 mdr->ls = mdlog->get_current_segment();
5406 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
5407 mdlog->start_entry(le);
5408 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5409 if (!mdr->more()->witnessed.empty()) {
5410 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5411 le->reqid = mdr->reqid;
5412 le->had_slaves = true;
5413 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5414 }
5415
5416 if (inc) {
5417 dn->pre_dirty();
5418 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
5419 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5420 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5421 } else {
5422 dn->pre_dirty();
5423 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
5424 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5425 le->metablob.add_null_dentry(dn, true);
5426 dn->push_projected_linkage();
5427 }
5428
5429 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
5430 }
5431
5432 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
5433 CDentry *dn, CInode *targeti,
5434 version_t dpv)
5435 {
5436 dout(10) << "_link_remote_finish "
5437 << (inc ? "link ":"unlink ")
5438 << *dn << " to " << *targeti << dendl;
5439
5440 assert(g_conf->mds_kill_link_at != 3);
5441
5442 if (!mdr->more()->witnessed.empty())
5443 mdcache->logged_master_update(mdr->reqid);
5444
5445 if (inc) {
5446 // link the new dentry
5447 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5448 if (!dnl->get_inode())
5449 dn->link_remote(dnl, targeti);
5450 dn->mark_dirty(dpv, mdr->ls);
5451 } else {
5452 // unlink main dentry
5453 dn->get_dir()->unlink_inode(dn);
5454 dn->pop_projected_linkage();
5455 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
5456 }
5457
5458 mdr->apply();
5459
5460 MDRequestRef null_ref;
5461 if (inc)
5462 mdcache->send_dentry_link(dn, null_ref);
5463 else
5464 mdcache->send_dentry_unlink(dn, NULL, null_ref);
5465
5466 // bump target popularity
5467 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5468 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5469
5470 // reply
5471 respond_to_request(mdr, 0);
5472
5473 if (!inc)
5474 // removing a new dn?
5475 dn->get_dir()->try_remove_unlinked_dn(dn);
5476 }
5477
5478
5479 // remote linking/unlinking
5480
5481 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5482 CInode *targeti;
5483 public:
5484 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5485 ServerLogContext(s, r), targeti(t) { }
5486 void finish(int r) override {
5487 assert(r == 0);
5488 server->_logged_slave_link(mdr, targeti);
5489 }
5490 };
5491
5492 class C_MDS_SlaveLinkCommit : public ServerContext {
5493 MDRequestRef mdr;
5494 CInode *targeti;
5495 public:
5496 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5497 ServerContext(s), mdr(r), targeti(t) { }
5498 void finish(int r) override {
5499 server->_commit_slave_link(mdr, r, targeti);
5500 }
5501 };
5502
5503 /* This function DOES put the mdr->slave_request before returning*/
5504 void Server::handle_slave_link_prep(MDRequestRef& mdr)
5505 {
5506 dout(10) << "handle_slave_link_prep " << *mdr
5507 << " on " << mdr->slave_request->get_object_info()
5508 << dendl;
5509
5510 assert(g_conf->mds_kill_link_at != 4);
5511
5512 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
5513 assert(targeti);
5514 dout(10) << "targeti " << *targeti << dendl;
5515 CDentry *dn = targeti->get_parent_dn();
5516 CDentry::linkage_t *dnl = dn->get_linkage();
5517 assert(dnl->is_primary());
5518
5519 mdr->set_op_stamp(mdr->slave_request->op_stamp);
5520
5521 mdr->auth_pin(targeti);
5522
5523 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5524 assert(g_conf->mds_kill_link_at != 5);
5525
5526 // journal it
5527 mdr->ls = mdlog->get_current_segment();
5528 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
5529 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
5530 mdlog->start_entry(le);
5531
5532 inode_t *pi = dnl->get_inode()->project_inode();
5533
5534 // update journaled target inode
5535 bool inc;
5536 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
5537 inc = true;
5538 pi->nlink++;
5539 } else {
5540 inc = false;
5541 pi->nlink--;
5542 }
5543
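// Capture just enough state to undo this prepare if the master aborts:
// the target's old ctime, plus the parent fragment's mtime/rctime, which
// the update below will also touch.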
5544 link_rollback rollback;
5545 rollback.reqid = mdr->reqid;
5546 rollback.ino = targeti->ino();
5547 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
5548 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
5549 rollback.old_dir_mtime = pf->fragstat.mtime;
5550 rollback.old_dir_rctime = pf->rstat.rctime;
5551 rollback.was_inc = inc;
5552 ::encode(rollback, le->rollback);
5553 mdr->more()->rollback_bl = le->rollback;
5554
5555 pi->ctime = mdr->get_op_stamp();
5556 pi->version = targeti->pre_dirty();
5557
5558 dout(10) << " projected inode " << pi << " v " << pi->version << dendl;
5559
5560 // commit case
5561 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
5562 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
5563
5564 // set up commit waiter
5565 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
5566
5567 mdr->more()->slave_update_journaled = true;
5568 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
5569 mdr, __func__);
5570 mdlog->flush();
5571 }
5572
5573 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
5574 {
5575 dout(10) << "_logged_slave_link " << *mdr
5576 << " " << *targeti << dendl;
5577
5578 assert(g_conf->mds_kill_link_at != 6);
5579
5580 // update the target
5581 targeti->pop_and_dirty_projected_inode(mdr->ls);
5582 mdr->apply();
5583
5584 // hit pop
5585 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5586
5587 // done.
5588 mdr->slave_request->put();
5589 mdr->slave_request = 0;
5590
5591 // ack
5592 if (!mdr->aborted) {
5593 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5594 MMDSSlaveRequest::OP_LINKPREPACK);
5595 mds->send_message_mds(reply, mdr->slave_to_mds);
5596 } else {
5597 dout(10) << " abort flag set, finishing" << dendl;
5598 mdcache->request_finish(mdr);
5599 }
5600 }
5601
5602
5603 struct C_MDS_CommittedSlave : public ServerLogContext {
5604 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
5605 void finish(int r) override {
5606 server->_committed_slave(mdr);
5607 }
5608 };
5609
5610 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
5611 {
5612 dout(10) << "_commit_slave_link " << *mdr
5613 << " r=" << r
5614 << " " << *targeti << dendl;
5615
5616 assert(g_conf->mds_kill_link_at != 7);
5617
5618 if (r == 0) {
5619 // drop our pins, etc.
5620 mdr->cleanup();
5621
5622 // write a commit to the journal
5623 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
5624 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
5625 mdlog->start_entry(le);
5626 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
5627 mdlog->flush();
5628 } else {
5629 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
5630 }
5631 }
5632
5633 void Server::_committed_slave(MDRequestRef& mdr)
5634 {
5635 dout(10) << "_committed_slave " << *mdr << dendl;
5636
5637 assert(g_conf->mds_kill_link_at != 8);
5638
5639 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5640 MMDSSlaveRequest::OP_COMMITTED);
5641 mds->send_message_mds(req, mdr->slave_to_mds);
5642 mdcache->request_finish(mdr);
5643 }
5644
5645 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5646 MutationRef mut;
5647 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5648 void finish(int r) override {
5649 server->_link_rollback_finish(mut, mdr);
5650 }
5651 };
5652
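// Undo a prepared-but-uncommitted nlink change. This runs either for a
// live slave request (mdr set) or during resolve after a restart (mdr
// null); it restores the inode's ctime and the parent dir's mtime/rctime,
// reverses the nlink delta, and journals the rollback as an ESlaveUpdate.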
5653 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
5654 {
5655 link_rollback rollback;
5656 bufferlist::iterator p = rbl.begin();
5657 ::decode(rollback, p);
5658
5659 dout(10) << "do_link_rollback on " << rollback.reqid
5660 << (rollback.was_inc ? " inc":" dec")
5661 << " ino " << rollback.ino
5662 << dendl;
5663
5664 assert(g_conf->mds_kill_link_at != 9);
5665
5666 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
5667 assert(mdr || mds->is_resolve());
5668
5669 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
5670 mut->ls = mds->mdlog->get_current_segment();
5671
5672 CInode *in = mdcache->get_inode(rollback.ino);
5673 assert(in);
5674 dout(10) << " target is " << *in << dendl;
5675 assert(!in->is_projected()); // live slave requests hold the versionlock xlock.
5676
5677 inode_t *pi = in->project_inode();
5678 pi->version = in->pre_dirty();
5679 mut->add_projected_inode(in);
5680
5681 // parent dir rctime
5682 CDir *parent = in->get_projected_parent_dn()->get_dir();
5683 fnode_t *pf = parent->project_fnode();
5684 mut->add_projected_fnode(parent);
5685 pf->version = parent->pre_dirty();
5686 if (pf->fragstat.mtime == pi->ctime) {
5687 pf->fragstat.mtime = rollback.old_dir_mtime;
5688 if (pf->rstat.rctime == pi->ctime)
5689 pf->rstat.rctime = rollback.old_dir_rctime;
5690 mut->add_updated_lock(&parent->get_inode()->filelock);
5691 mut->add_updated_lock(&parent->get_inode()->nestlock);
5692 }
5693
5694 // inode
5695 pi->ctime = rollback.old_ctime;
5696 if (rollback.was_inc)
5697 pi->nlink--;
5698 else
5699 pi->nlink++;
5700
5701 // journal it
5702 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
5703 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
5704 mdlog->start_entry(le);
5705 le->commit.add_dir_context(parent);
5706 le->commit.add_dir(parent, true);
5707 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
5708
5709 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
5710 mdr, __func__);
5711 mdlog->flush();
5712 }
5713
5714 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
5715 {
5716 dout(10) << "_link_rollback_finish" << dendl;
5717
5718 assert(g_conf->mds_kill_link_at != 10);
5719
5720 mut->apply();
5721 if (mdr)
5722 mdcache->request_finish(mdr);
5723
5724 mdcache->finish_rollback(mut->reqid);
5725
5726 mut->cleanup();
5727 }
5728
5729
5730 /* This function DOES NOT put the passed message before returning*/
5731 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
5732 {
5733 dout(10) << "handle_slave_link_prep_ack " << *mdr
5734 << " " << *m << dendl;
5735 mds_rank_t from = mds_rank_t(m->get_source().num());
5736
5737 assert(g_conf->mds_kill_link_at != 11);
5738
5739 // note slave
5740 mdr->more()->slaves.insert(from);
5741
5742 // witnessed!
5743 assert(mdr->more()->witnessed.count(from) == 0);
5744 mdr->more()->witnessed.insert(from);
5745 assert(!m->is_not_journaled());
5746 mdr->more()->has_journaled_slaves = true;
5747
5748 // remove from waiting list
5749 assert(mdr->more()->waiting_on_slave.count(from));
5750 mdr->more()->waiting_on_slave.erase(from);
5751
5752 assert(mdr->more()->waiting_on_slave.empty());
5753
5754 dispatch_client_request(mdr); // go again!
5755 }
5756
5757
5758
5759
5760
5761 // UNLINK
5762
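/* Handle CEPH_MDS_OP_UNLINK and CEPH_MDS_OP_RMDIR. A primary dentry needs
 * a stray dentry to hold the inode until its last reference drops; a
 * remote dentry whose inode is non-auth goes through _link_remote() with
 * inc=false. Removing a directory that is a subtree root also requires
 * every replica to witness the unlink before we journal it. */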
5763 void Server::handle_client_unlink(MDRequestRef& mdr)
5764 {
5765 MClientRequest *req = mdr->client_request;
5766 client_t client = mdr->get_client();
5767
5768 // rmdir or unlink?
5769 bool rmdir = false;
5770 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
5771
5772 if (req->get_filepath().depth() == 0) {
5773 respond_to_request(mdr, -EINVAL);
5774 return;
5775 }
5776
5777 // traverse to path
5778 vector<CDentry*> trace;
5779 CInode *in;
5780 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
5781 if (r > 0) return;
5782 if (r < 0) {
5783 if (r == -ESTALE) {
5784 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
5785 mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
5786 return;
5787 }
5788 respond_to_request(mdr, r);
5789 return;
5790 }
5791 if (mdr->snapid != CEPH_NOSNAP) {
5792 respond_to_request(mdr, -EROFS);
5793 return;
5794 }
5795
5796 CDentry *dn = trace[trace.size()-1];
5797 assert(dn);
5798 if (!dn->is_auth()) {
5799 mdcache->request_forward(mdr, dn->authority().first);
5800 return;
5801 }
5802
5803 CInode *diri = dn->get_dir()->get_inode();
5804
5805 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
5806 assert(!dnl->is_null());
5807
5808 if (rmdir) {
5809 dout(7) << "handle_client_rmdir on " << *dn << dendl;
5810 } else {
5811 dout(7) << "handle_client_unlink on " << *dn << dendl;
5812 }
5813 dout(7) << "dn links to " << *in << dendl;
5814
5815 // rmdir vs is_dir
5816 if (in->is_dir()) {
5817 if (rmdir) {
5818 // do empty directory checks
5819 if (_dir_is_nonempty_unlocked(mdr, in)) {
5820 respond_to_request(mdr, -ENOTEMPTY);
5821 return;
5822 }
5823 } else {
5824 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
5825 respond_to_request(mdr, -EISDIR);
5826 return;
5827 }
5828 } else {
5829 if (rmdir) {
5830 // unlink
5831 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
5832 respond_to_request(mdr, -ENOTDIR);
5833 return;
5834 }
5835 }
5836
5837 // -- create stray dentry? --
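// A primary dentry owns its inode, so the inode can't just vanish while
// remote links, caps, or replicas may still refer to it; instead it gets
// relinked under a stray dentry in this MDS's stray directory and is
// purged later, once nothing references it.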
5838 CDentry *straydn = NULL;
5839 if (dnl->is_primary()) {
5840 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
5841 if (!straydn)
5842 return;
5843 dout(10) << " straydn is " << *straydn << dendl;
5844 } else if (mdr->straydn) {
5845 mdr->unpin(mdr->straydn);
5846 mdr->straydn = NULL;
5847 }
5848
5849 // lock
5850 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5851
5852 for (int i=0; i<(int)trace.size()-1; i++)
5853 rdlocks.insert(&trace[i]->lock);
5854 xlocks.insert(&dn->lock);
5855 wrlocks.insert(&diri->filelock);
5856 wrlocks.insert(&diri->nestlock);
5857 xlocks.insert(&in->linklock);
5858 if (straydn) {
5859 wrlocks.insert(&straydn->get_dir()->inode->filelock);
5860 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
5861 xlocks.insert(&straydn->lock);
5862 }
5863 if (in->is_dir())
5864 rdlocks.insert(&in->filelock); // to verify it's empty
5865 mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
5866
5867 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5868 return;
5869
5870 if (in->is_dir() &&
5871 _dir_is_nonempty(mdr, in)) {
5872 respond_to_request(mdr, -ENOTEMPTY);
5873 return;
5874 }
5875
5876 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5877 if (!check_access(mdr, diri, MAY_WRITE))
5878 return;
5879 }
5880
5881 // yay!
5882 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
5883 // subtree root auths need to be witnesses
5884 set<mds_rank_t> witnesses;
5885 in->list_replicas(witnesses);
5886 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
5887
5888 for (set<mds_rank_t>::iterator p = witnesses.begin();
5889 p != witnesses.end();
5890 ++p) {
5891 if (mdr->more()->witnessed.count(*p)) {
5892 dout(10) << " already witnessed by mds." << *p << dendl;
5893 } else if (mdr->more()->waiting_on_slave.count(*p)) {
5894 dout(10) << " already waiting on witness mds." << *p << dendl;
5895 } else {
5896 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
5897 return;
5898 }
5899 }
5900 if (!mdr->more()->waiting_on_slave.empty())
5901 return; // we're waiting for a witness.
5902 }
5903
5904 // ok!
5905 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
5906 _link_remote(mdr, false, dn, dnl->get_inode());
5907 else
5908 _unlink_local(mdr, dn, straydn);
5909 }
5910
5911 class C_MDS_unlink_local_finish : public ServerLogContext {
5912 CDentry *dn;
5913 CDentry *straydn;
5914 version_t dnpv; // deleted dentry
5915 public:
5916 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
5917 ServerLogContext(s, r), dn(d), straydn(sd),
5918 dnpv(d->get_projected_version()) {}
5919 void finish(int r) override {
5920 assert(r == 0);
5921 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
5922 }
5923 };
5924
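// _unlink_local: dentry and inode are both ours. Journal a null dentry for
// the unlinked name, project nlink--/ctime on the inode (plus, for a
// primary link, its relink under straydn), then let the finisher apply the
// projections and notify replicas via send_dentry_unlink().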
5925 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
5926 {
5927 dout(10) << "_unlink_local " << *dn << dendl;
5928
5929 CDentry::linkage_t *dnl = dn->get_projected_linkage();
5930 CInode *in = dnl->get_inode();
5931
5932 SnapRealm *realm = in->find_snaprealm();
5933 snapid_t follows = realm->get_newest_seq();
5934
5935 // ok, let's do it.
5936 mdr->ls = mdlog->get_current_segment();
5937
5938 // prepare log entry
5939 EUpdate *le = new EUpdate(mdlog, "unlink_local");
5940 mdlog->start_entry(le);
5941 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5942 if (!mdr->more()->witnessed.empty()) {
5943 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5944 le->reqid = mdr->reqid;
5945 le->had_slaves = true;
5946 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5947 }
5948
5949 if (straydn) {
5950 assert(dnl->is_primary());
5951 straydn->push_projected_linkage(in);
5952 straydn->first = follows + 1;
5953 }
5954
5955 // the unlinked dentry
5956 dn->pre_dirty();
5957
5958 inode_t *pi = in->project_inode();
5959 dn->make_path_string(pi->stray_prior_path, true);
5960 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
5961 pi->version = in->pre_dirty();
5962 pi->ctime = mdr->get_op_stamp();
5963 pi->change_attr++;
5964 pi->nlink--;
5965 if (pi->nlink == 0)
5966 in->state_set(CInode::STATE_ORPHAN);
5967
5968 if (dnl->is_primary()) {
5969 // primary link. add stray dentry.
5970 assert(straydn);
5971 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
5972 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5973
5974 // project snaprealm, too
5975 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
5976 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
5977
5978 pi->update_backtrace();
5979 le->metablob.add_primary_dentry(straydn, in, true, true);
5980 } else {
5981 // remote link. update remote inode.
5982 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
5983 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5984 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5985 }
5986
5987 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5988 le->metablob.add_null_dentry(dn, true);
5989
5990 if (in->is_dir()) {
5991 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
5992 le->metablob.renamed_dirino = in->ino();
5993 }
5994
5995 dn->push_projected_linkage();
5996
5997 if (in->is_dir()) {
5998 assert(straydn);
5999 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6000 }
6001
6002 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6003 }
6004
6005 void Server::_unlink_local_finish(MDRequestRef& mdr,
6006 CDentry *dn, CDentry *straydn,
6007 version_t dnpv)
6008 {
6009 dout(10) << "_unlink_local_finish " << *dn << dendl;
6010
6011 if (!mdr->more()->witnessed.empty())
6012 mdcache->logged_master_update(mdr->reqid);
6013
6014 // unlink main dentry
6015 dn->get_dir()->unlink_inode(dn);
6016 dn->pop_projected_linkage();
6017
6018 // relink as stray? (i.e. was primary link?)
6019 CInode *strayin = NULL;
6020 bool snap_is_new = false;
6021 if (straydn) {
6022 dout(20) << " straydn is " << *straydn << dendl;
6023 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
6024 strayin = straydnl->get_inode();
6025
6026 snap_is_new = strayin->snaprealm != NULL;
6027 mdcache->touch_dentry_bottom(straydn);
6028 }
6029
6030 dn->mark_dirty(dnpv, mdr->ls);
6031 mdr->apply();
6032
6033 if (snap_is_new) // only new if strayin exists
6034 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
6035
6036 mdcache->send_dentry_unlink(dn, straydn, mdr);
6037
6038 // update subtree map?
6039 if (straydn && strayin->is_dir())
6040 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6041
6042 // bump pop
6043 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
6044
6045 // reply
6046 respond_to_request(mdr, 0);
6047
6048 // removing a new dn?
6049 dn->get_dir()->try_remove_unlinked_dn(dn);
6050
6051 // clean up ?
6052 // respond_to_request() drops locks. So stray reintegration can race with us.
6053 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6054 // Tip off the MDCache that this dentry is a stray that
6055 // might be eligible for purge.
6056 mdcache->notify_stray(straydn);
6057 }
6058 }
6059
6060 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6061 {
6062 if (mds->is_cluster_degraded() &&
6063 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6064 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6065 if (mdr->more()->waiting_on_slave.empty())
6066 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6067 return false;
6068 }
6069
6070 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6071 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6072 MMDSSlaveRequest::OP_RMDIRPREP);
6073 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6074 for (auto dn : trace)
6075 req->srcdnpath.push_dentry(dn->name);
6076 mdcache->replicate_stray(straydn, who, req->stray);
6077
6078 req->op_stamp = mdr->get_op_stamp();
6079 mds->send_message_mds(req, who);
6080
6081 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6082 mdr->more()->waiting_on_slave.insert(who);
6083 return true;
6084 }
6085
6086 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6087 CDentry *dn, *straydn;
6088 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6089 : ServerLogContext(s, r), dn(d), straydn(st) {}
6090 void finish(int r) override {
6091 server->_logged_slave_rmdir(mdr, dn, straydn);
6092 }
6093 };
6094
6095 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6096 MDRequestRef mdr;
6097 CDentry *straydn;
6098 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6099 : ServerContext(s), mdr(r), straydn(sd) { }
6100 void finish(int r) override {
6101 server->_commit_slave_rmdir(mdr, r, straydn);
6102 }
6103 };
6104
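// Slave-side counterpart of _rmdir_prepare_witness(). If the directory has
// no auth subtree on this rank we can relink it under straydn immediately
// and ack with mark_not_journaled(); otherwise we journal an ESlaveUpdate
// PREPARE carrying a rollback blob and ack once that entry is logged.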
6105 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6106 {
6107 dout(10) << "handle_slave_rmdir_prep " << *mdr
6108 << " " << mdr->slave_request->srcdnpath
6109 << " to " << mdr->slave_request->destdnpath
6110 << dendl;
6111
6112 vector<CDentry*> trace;
6113 filepath srcpath(mdr->slave_request->srcdnpath);
6114 dout(10) << " src " << srcpath << dendl;
6115 CInode *in;
6116 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6117 if (r > 0) return;
6118 if (r == -ESTALE) {
6119 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6120 mdr->slave_to_mds);
6121 return;
6122 }
6123 assert(r == 0);
6124 CDentry *dn = trace[trace.size()-1];
6125 dout(10) << " dn " << *dn << dendl;
6126 mdr->pin(dn);
6127
6128 assert(mdr->straydn);
6129 CDentry *straydn = mdr->straydn;
6130 dout(10) << " straydn " << *straydn << dendl;
6131
6132 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6133
6134 rmdir_rollback rollback;
6135 rollback.reqid = mdr->reqid;
6136 rollback.src_dir = dn->get_dir()->dirfrag();
6137 rollback.src_dname = dn->name;
6138 rollback.dest_dir = straydn->get_dir()->dirfrag();
6139 rollback.dest_dname = straydn->name;
6140 ::encode(rollback, mdr->more()->rollback_bl);
6141 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6142
6143 // set up commit waiter
6144 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
6145
6146 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6147 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6148 dn->get_dir()->unlink_inode(dn);
6149 straydn->get_dir()->link_primary_inode(straydn, in);
6150
6151 assert(straydn->first >= in->first);
6152 in->first = straydn->first;
6153
6154 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
6155
6156 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6157 MMDSSlaveRequest::OP_RMDIRPREPACK);
6158 reply->mark_not_journaled();
6159 mds->send_message_mds(reply, mdr->slave_to_mds);
6160
6161 // send caps to auth (if we're not already)
6162 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
6163 mdcache->migrator->export_caps(in);
6164
6165 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
6166
6167 mdr->slave_request->put();
6168 mdr->slave_request = 0;
6169 mdr->straydn = 0;
6170 return;
6171 }
6172
6173 straydn->push_projected_linkage(in);
6174 dn->push_projected_linkage();
6175
6176 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6177 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
6178 mdlog->start_entry(le);
6179 le->rollback = mdr->more()->rollback_bl;
6180
6181 le->commit.add_dir_context(straydn->get_dir());
6182 le->commit.add_primary_dentry(straydn, in, true);
6183 // slave: no need to journal original dentry
6184
6185 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6186 le->commit.renamed_dirino = in->ino();
6187
6188 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6189
6190 mdr->more()->slave_update_journaled = true;
6191 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
6192 mdr, __func__);
6193 mdlog->flush();
6194 }
6195
6196 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6197 {
6198 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
6199
6200 // update our cache now, so we are consistent with what is in the journal
6201 // when we journal a subtree map
6202 CInode *in = dn->get_linkage()->get_inode();
6203 dn->get_dir()->unlink_inode(dn);
6204 straydn->pop_projected_linkage();
6205 dn->pop_projected_linkage();
6206 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
6207
6208 // done.
6209 mdr->slave_request->put();
6210 mdr->slave_request = 0;
6211 mdr->straydn = 0;
6212
6213 if (!mdr->aborted) {
6214 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6215 MMDSSlaveRequest::OP_RMDIRPREPACK);
6216 mds->send_message_mds(reply, mdr->slave_to_mds);
6217 } else {
6218 dout(10) << " abort flag set, finishing" << dendl;
6219 mdcache->request_finish(mdr);
6220 }
6221 }
6222
6223 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
6224 {
6225 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6226 << " " << *ack << dendl;
6227
6228 mds_rank_t from = mds_rank_t(ack->get_source().num());
6229
6230 mdr->more()->slaves.insert(from);
6231 mdr->more()->witnessed.insert(from);
6232 if (!ack->is_not_journaled())
6233 mdr->more()->has_journaled_slaves = true;
6234
6235 // remove from waiting list
6236 assert(mdr->more()->waiting_on_slave.count(from));
6237 mdr->more()->waiting_on_slave.erase(from);
6238
6239 if (mdr->more()->waiting_on_slave.empty())
6240 dispatch_client_request(mdr); // go again!
6241 else
6242 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
6243 }
6244
6245 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
6246 {
6247 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
6248
6249 if (r == 0) {
6250 if (mdr->more()->slave_update_journaled) {
6251 CInode *strayin = straydn->get_projected_linkage()->get_inode();
6252 if (strayin && !strayin->snaprealm)
6253 mdcache->clear_dirty_bits_for_stray(strayin);
6254 }
6255
6256 mdr->cleanup();
6257
6258 if (mdr->more()->slave_update_journaled) {
6259 // write a commit to the journal
6260 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
6261 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
6262 ESlaveUpdate::RMDIR);
6263 mdlog->start_entry(le);
6264 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6265 mdlog->flush();
6266 } else {
6267 _committed_slave(mdr);
6268 }
6269 } else {
6270 // abort
6271 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6272 }
6273 }
6274
6275 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6276 metareqid_t reqid;
6277 CDentry *dn;
6278 CDentry *straydn;
6279 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6280 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6281 void finish(int r) override {
6282 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6283 }
6284 };
6285
6286 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6287 {
6288 // unlike the other rollback methods, the rmdir rollback is only
6289 // needed to record the subtree changes in the journal for inode
6290 // replicas that are auth for empty dirfrags. no actual changes to
6291 // the file system are taking place here, so there is no Mutation.
6292
6293 rmdir_rollback rollback;
6294 bufferlist::iterator p = rbl.begin();
6295 ::decode(rollback, p);
6296
6297 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6298 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6299 assert(mdr || mds->is_resolve());
6300
6301 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6302 if (!dir)
6303 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6304 assert(dir);
6305 CDentry *dn = dir->lookup(rollback.src_dname);
6306 assert(dn);
6307 dout(10) << " dn " << *dn << dendl;
6308 dir = mdcache->get_dirfrag(rollback.dest_dir);
6309 assert(dir);
6310 CDentry *straydn = dir->lookup(rollback.dest_dname);
6311 assert(straydn);
6312 dout(10) << " straydn " << *dn << dendl;
6313 CInode *in = straydn->get_linkage()->get_inode();
6314
6315 if (mdr && !mdr->more()->slave_update_journaled) {
6316 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6317
6318 straydn->get_dir()->unlink_inode(straydn);
6319 dn->get_dir()->link_primary_inode(dn, in);
6320
6321 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6322
6323 mdcache->request_finish(mdr);
6324 mdcache->finish_rollback(rollback.reqid);
6325 return;
6326 }
6327
6328 dn->push_projected_linkage(in);
6329 straydn->push_projected_linkage();
6330
6331 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6332 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6333 mdlog->start_entry(le);
6334
6335 le->commit.add_dir_context(dn->get_dir());
6336 le->commit.add_primary_dentry(dn, in, true);
6337 // slave: no need to journal straydn
6338
6339 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6340 le->commit.renamed_dirino = in->ino();
6341
6342 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6343
6344 submit_mdlog_entry(le,
6345 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6346 dn, straydn),
6347 mdr, __func__);
6348 mdlog->flush();
6349 }
6350
6351 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
6352 {
6353 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
6354
6355 straydn->get_dir()->unlink_inode(straydn);
6356 dn->pop_projected_linkage();
6357 straydn->pop_projected_linkage();
6358
6359 CInode *in = dn->get_linkage()->get_inode();
6360 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
6361 if (mds->is_resolve()) {
6362 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
6363 mdcache->try_trim_non_auth_subtree(root);
6364 }
6365
6366 if (mdr)
6367 mdcache->request_finish(mdr);
6368
6369 mdcache->finish_rollback(reqid);
6370 }
6371
6372
6373 /** _dir_is_nonempty[_unlocked]
6374 *
6375 * check whether a directory is non-empty (if it is, we can't rmdir it).
6376 *
6377 * the unlocked variant is a fast-path check; we can't really be
6378 * sure until we rdlock the filelock.
6379 */
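// The locked check is two-part: any dirfrag with a non-zero projected
// fragstat makes the directory non-empty, and if the per-frag stats don't
// sum to the inode's dirstat we conservatively report non-empty as well,
// since the counts are still settling.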
6380 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6381 {
6382 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6383 assert(in->is_auth());
6384
6385 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6386 return true; // in a snapshot!
6387
6388 list<CDir*> ls;
6389 in->get_dirfrags(ls);
6390 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6391 CDir *dir = *p;
6392 // is the frag obviously non-empty?
6393 if (dir->is_auth()) {
6394 if (dir->get_projected_fnode()->fragstat.size()) {
6395 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6396 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6397 return true;
6398 }
6399 }
6400 }
6401
6402 return false;
6403 }
6404
6405 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6406 {
6407 dout(10) << "dir_is_nonempty " << *in << dendl;
6408 assert(in->is_auth());
6409 assert(in->filelock.can_read(mdr->get_client()));
6410
6411 frag_info_t dirstat;
6412 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6413
6414 list<CDir*> ls;
6415 in->get_dirfrags(ls);
6416 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6417 CDir *dir = *p;
6418 const fnode_t *pf = dir->get_projected_fnode();
6419 if (pf->fragstat.size()) {
6420 dout(10) << "dir_is_nonempty dirstat has "
6421 << pf->fragstat.size() << " items " << *dir << dendl;
6422 return true;
6423 }
6424
6425 if (pf->accounted_fragstat.version == dirstat_version)
6426 dirstat.add(pf->accounted_fragstat);
6427 else
6428 dirstat.add(pf->fragstat);
6429 }
6430
6431 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6432 }
6433
6434
6435 // ======================================================
6436
6437
6438 class C_MDS_rename_finish : public ServerLogContext {
6439 CDentry *srcdn;
6440 CDentry *destdn;
6441 CDentry *straydn;
6442 public:
6443 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6444 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6445 ServerLogContext(s, r),
6446 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6447 void finish(int r) override {
6448 assert(r == 0);
6449 server->_rename_finish(mdr, srcdn, destdn, straydn);
6450 }
6451 };
6452
6453
6454 /** handle_client_rename
6455 *
6456 * rename master is the destdn auth. this is because cached inodes
6457 * must remain connected. thus, any replica of srci must also
6458 * replicate destdn, and possibly straydn, so that srci (and
6459 * destdn->inode) remain connected during the rename.
6460 *
6461 * to do this, we freeze srci, then the master (destdn auth) verifies that
6462 * all other nodes have also replicated destdn and straydn. note that
6463 * destdn replicas need not also replicate srci. this only works when
6464 * destdn is master.
6465 *
6466 * This function takes responsibility for the passed mdr.
6467 */
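/* In outline: take all locks (remote-wrlocking the src dir's scatterlocks
 * when its auth is elsewhere), send prepares to each witness through
 * _rename_prepare_witness() with the srcdn auth saved for last, collect
 * acks in mdr->more()->witnessed, and only then journal EUpdate("rename"),
 * recording the witnesses as uncommitted slaves for resolve. */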
6468 void Server::handle_client_rename(MDRequestRef& mdr)
6469 {
6470 MClientRequest *req = mdr->client_request;
6471 dout(7) << "handle_client_rename " << *req << dendl;
6472
6473 filepath destpath = req->get_filepath();
6474 filepath srcpath = req->get_filepath2();
6475 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6476 respond_to_request(mdr, -EINVAL);
6477 return;
6478 }
6479 const string &destname = destpath.last_dentry();
6480
6481 vector<CDentry*>& srctrace = mdr->dn[1];
6482 vector<CDentry*>& desttrace = mdr->dn[0];
6483
6484 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6485
6486 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6487 if (!destdn) return;
6488 dout(10) << " destdn " << *destdn << dendl;
6489 if (mdr->snapid != CEPH_NOSNAP) {
6490 respond_to_request(mdr, -EROFS);
6491 return;
6492 }
6493 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6494 CDir *destdir = destdn->get_dir();
6495 assert(destdir->is_auth());
6496
6497 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6498 if (r > 0)
6499 return; // delayed
6500 if (r < 0) {
6501 if (r == -ESTALE) {
6502 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6503 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6504 } else {
6505 dout(10) << "FAIL on error " << r << dendl;
6506 respond_to_request(mdr, r);
6507 }
6508 return;
6509
6510 }
6511 assert(!srctrace.empty());
6512 CDentry *srcdn = srctrace[srctrace.size()-1];
6513 dout(10) << " srcdn " << *srcdn << dendl;
6514 if (srcdn->last != CEPH_NOSNAP) {
6515 respond_to_request(mdr, -EROFS);
6516 return;
6517 }
6518 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6519 CInode *srci = srcdnl->get_inode();
6520 dout(10) << " srci " << *srci << dendl;
6521
6522 CInode *oldin = 0;
6523 if (!destdnl->is_null()) {
6524 //dout(10) << "dest dn exists " << *destdn << dendl;
6525 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6526 if (!oldin) return;
6527 dout(10) << " oldin " << *oldin << dendl;
6528
6529 // mv /some/thing /to/some/existing_other_thing
6530 if (oldin->is_dir() && !srci->is_dir()) {
6531 respond_to_request(mdr, -EISDIR);
6532 return;
6533 }
6534 if (!oldin->is_dir() && srci->is_dir()) {
6535 respond_to_request(mdr, -ENOTDIR);
6536 return;
6537 }
6538
6539 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6540 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6541 respond_to_request(mdr, -ENOTEMPTY);
6542 return;
6543 }
6544 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6545 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6546 return;
6547 }
6548 }
6549
6550 // -- some sanity checks --
6551
6552 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6553 if (destpath.get_ino() != srcpath.get_ino() &&
6554 !(req->get_source().is_mds() &&
6555 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6556 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6557 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6558 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6559 while (srcbase != destbase &&
6560 !srcbase->is_projected_ancestor_of(destbase)) {
6561 CDentry *pdn = srcbase->get_projected_parent_dn();
6562 srctrace.insert(srctrace.begin(), pdn);
6563 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6564 srcbase = pdn->get_dir()->get_inode();
6565 }
6566
6567 // then, extend destpath until it shares the same parent inode as srcpath.
6568 while (destbase != srcbase) {
6569 CDentry *pdn = destbase->get_projected_parent_dn();
6570 desttrace.insert(desttrace.begin(), pdn);
6571 rdlocks.insert(&pdn->lock);
6572 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6573 destbase = pdn->get_dir()->get_inode();
6574 }
6575 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6576 }
6577
6578 // src == dest?
6579 if (srcdn->get_dir() == destdir && srcdn->name == destname) {
6580 dout(7) << "rename src=dest, noop" << dendl;
6581 respond_to_request(mdr, 0);
6582 return;
6583 }
6584
6585 // dest a child of src?
6586 // e.g. mv /usr /usr/foo
6587 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6588 while (pdn) {
6589 if (pdn == srcdn) {
6590 dout(7) << "cannot rename item to be a child of itself" << dendl;
6591 respond_to_request(mdr, -EINVAL);
6592 return;
6593 }
6594 pdn = pdn->get_dir()->inode->parent;
6595 }
6596
6597 // is this a stray migration, reintegration or merge? (sanity checks!)
6598 if (mdr->reqid.name.is_mds() &&
6599 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6600 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6601 !(destdnl->is_remote() &&
6602 destdnl->get_remote_ino() == srci->ino())) {
6603 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatever.
6604 return;
6605 }
6606
6607 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6608 (srcdnl->is_primary() || destdnl->is_primary()));
6609 if (linkmerge)
6610 dout(10) << " this is a link merge" << dendl;
6611
6612 // -- create stray dentry? --
6613 CDentry *straydn = NULL;
6614 if (destdnl->is_primary() && !linkmerge) {
6615 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6616 if (!straydn)
6617 return;
6618 dout(10) << " straydn is " << *straydn << dendl;
6619 } else if (mdr->straydn) {
6620 mdr->unpin(mdr->straydn);
6621 mdr->straydn = NULL;
6622 }
6623
6624 // -- prepare witness list --
6625 /*
6626 * NOTE: we use _all_ replicas as witnesses.
6627 * this probably isn't totally necessary (esp for file renames),
6628 * but if/when we change that, we have to make sure rejoin is
6629 * sufficiently robust to handle strong rejoins from survivors
6630 * with totally wrong dentry->inode linkage.
6631 * (currently, it can ignore rename effects, because the resolve
6632 * stage will sort them out.)
6633 */
6634 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6635 if (srcdn->is_auth())
6636 srcdn->list_replicas(witnesses);
6637 else
6638 witnesses.insert(srcdn->authority().first);
6639 if (srcdnl->is_remote() && !srci->is_auth())
6640 witnesses.insert(srci->authority().first);
6641 destdn->list_replicas(witnesses);
6642 if (destdnl->is_remote() && !oldin->is_auth())
6643 witnesses.insert(oldin->authority().first);
6644 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6645
6646
6647 // -- locks --
6648 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6649
6650 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6651 for (int i=0; i<(int)srctrace.size(); i++)
6652 rdlocks.insert(&srctrace[i]->lock);
6653 xlocks.insert(&srcdn->lock);
6654 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6655 if (srcdirauth != mds->get_nodeid()) {
6656 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6657 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6658 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6659 if (srci->is_dir())
6660 rdlocks.insert(&srci->dirfragtreelock);
6661 } else {
6662 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6663 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6664 }
6665 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6666
6667 // straydn?
6668 if (straydn) {
6669 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6670 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6671 xlocks.insert(&straydn->lock);
6672 }
6673
6674 // xlock versionlock on dentries if there are witnesses.
6675 // replicas can't see projected dentry linkages, and will get
6676 // confused if we try to pipeline things.
6677 if (!witnesses.empty()) {
6678 // take xlock on all projected ancestor dentries for srcdn and destdn.
6679 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6680 for (int i= 0; i<(int)srctrace.size(); i++) {
6681 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6682 xlocks.insert(&srctrace[i]->versionlock);
6683 }
6684 for (int i=0; i<(int)desttrace.size(); i++) {
6685 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6686 xlocks.insert(&desttrace[i]->versionlock);
6687 }
6688 // xlock srci and oldin's primary dentries, so witnesses can call
6689 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6690 // is traversed.
6691 if (srcdnl->is_remote())
6692 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6693 if (destdnl->is_remote())
6694 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6695 }
6696
6697 // we need to update srci's ctime. xlock its least contended lock to do that...
6698 xlocks.insert(&srci->linklock);
6699
6700 // xlock oldin (for nlink--)
6701 if (oldin) {
6702 xlocks.insert(&oldin->linklock);
6703 if (oldin->is_dir())
6704 rdlocks.insert(&oldin->filelock);
6705 }
6706 if (srcdnl->is_primary() && srci->is_dir())
6707 // FIXME: this should happen whenever we are renaming between
6708 // realms, regardless of the file type
6709 // FIXME: If/when this changes, make sure to update the
6710 // "allowance" in handle_slave_rename_prep
6711 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6712 else
6713 rdlocks.insert(&srci->snaplock);
6714
6715 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6716 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6717 &remote_wrlocks, auth_pin_freeze))
6718 return;
6719
6720 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6721 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6722 return;
6723
6724 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6725 return;
6726
6727 if (!check_fragment_space(mdr, destdn->get_dir()))
6728 return;
6729
6730 if (!check_access(mdr, srci, MAY_WRITE))
6731 return;
6732 }
6733
6734 // with read lock, really verify oldin is empty
6735 if (oldin &&
6736 oldin->is_dir() &&
6737 _dir_is_nonempty(mdr, oldin)) {
6738 respond_to_request(mdr, -ENOTEMPTY);
6739 return;
6740 }
6741
6742 /* project_past_snaprealm_parent() will do this job
6743 *
6744 // moving between snaprealms?
6745 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6746 SnapRealm *srcrealm = srci->find_snaprealm();
6747 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6748 if (srcrealm != destrealm &&
6749 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6750 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6751 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6752 mdcache->snaprealm_create(mdr, srci);
6753 return;
6754 }
6755 }
6756 */
6757
6758 assert(g_conf->mds_kill_rename_at != 1);
6759
6760 // -- open all srcdn inode frags, if any --
6761 // we need these open so that auth can properly delegate from inode to dirfrags
6762 // after the inode is _ours_.
6763 if (srcdnl->is_primary() &&
6764 !srcdn->is_auth() &&
6765 srci->is_dir()) {
6766 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6767 mdr->set_stickydirs(srci);
6768
6769 list<frag_t> frags;
6770 srci->dirfragtree.get_leaves(frags);
6771 for (list<frag_t>::iterator p = frags.begin();
6772 p != frags.end();
6773 ++p) {
6774 CDir *dir = srci->get_dirfrag(*p);
6775 if (!dir) {
6776 dout(10) << " opening " << *p << " under " << *srci << dendl;
6777 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6778 return;
6779 }
6780 }
6781 }
6782
6783 // -- prepare witnesses --
6784
6785 // do srcdn auth last
6786 mds_rank_t last = MDS_RANK_NONE;
6787 if (!srcdn->is_auth()) {
6788 last = srcdn->authority().first;
6789 mdr->more()->srcdn_auth_mds = last;
6790 // ask auth of srci to mark srci as ambiguous auth if more than two MDSes
6791 // are involved in the rename operation.
6792 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6793 dout(10) << " preparing ambiguous auth for srci" << dendl;
6794 assert(mdr->more()->is_remote_frozen_authpin);
6795 assert(mdr->more()->rename_inode == srci);
6796 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6797 return;
6798 }
6799 }
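// Gather the remaining witnesses in any order; each slave journals an
// OP_PREPARE it can roll back. The srcdn auth goes last because its ack also
// carries the inode export and confirms the witness list covers every replica.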
6800
6801 for (set<mds_rank_t>::iterator p = witnesses.begin();
6802 p != witnesses.end();
6803 ++p) {
6804 if (*p == last) continue; // do it last!
6805 if (mdr->more()->witnessed.count(*p)) {
6806 dout(10) << " already witnessed by mds." << *p << dendl;
6807 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6808 dout(10) << " already waiting on witness mds." << *p << dendl;
6809 } else {
6810 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6811 return;
6812 }
6813 }
6814 if (!mdr->more()->waiting_on_slave.empty())
6815 return; // we're waiting for a witness.
6816
6817 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6818 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6819 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6820 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6821 return;
6822 }
6823
6824 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6825 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6826 assert(g_conf->mds_kill_rename_at != 3);
6827 if (!mdr->more()->slaves.empty() && srci->is_dir())
6828 assert(g_conf->mds_kill_rename_at != 4);
6829
6830 // -- declare now --
6831 mdr->set_mds_stamp(ceph_clock_now());
6832
6833 // -- prepare journal entry --
6834 mdr->ls = mdlog->get_current_segment();
6835 EUpdate *le = new EUpdate(mdlog, "rename");
6836 mdlog->start_entry(le);
6837 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6838 if (!mdr->more()->witnessed.empty()) {
6839 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6840
6841 le->reqid = mdr->reqid;
6842 le->had_slaves = true;
6843
6844 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6845 // no need to send frozen auth pin to recovering auth MDS of srci
6846 mdr->more()->is_remote_frozen_authpin = false;
6847 }
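// Tracking the op as an uncommitted master (keyed by reqid) lets the resolve
// phase sort things out after a failure: slaves hold their prepared state
// until they learn whether this master committed or aborted.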
6848
6849 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6850 if (le->client_map.length())
6851 le->cmapv = mds->sessionmap.get_projected();
6852
6853 // -- commit locally --
6854 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6855
6856 journal_and_reply(mdr, srci, destdn, le, fin);
6857 }
6858
6859
6860 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
6861 {
6862 dout(10) << "_rename_finish " << *mdr << dendl;
6863
6864 if (!mdr->more()->witnessed.empty())
6865 mdcache->logged_master_update(mdr->reqid);
6866
6867 // apply
6868 _rename_apply(mdr, srcdn, destdn, straydn);
6869
6870 mdcache->send_dentry_link(destdn, mdr);
6871
6872 CDentry::linkage_t *destdnl = destdn->get_linkage();
6873 CInode *in = destdnl->get_inode();
6874 bool need_eval = mdr->more()->cap_imports.count(in);
6875
6876 // test hack: test slave commit
6877 if (!mdr->more()->slaves.empty() && !in->is_dir())
6878 assert(g_conf->mds_kill_rename_at != 5);
6879 if (!mdr->more()->slaves.empty() && in->is_dir())
6880 assert(g_conf->mds_kill_rename_at != 6);
6881
6882 // bump popularity
6883 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
6884 if (destdnl->is_remote() && in->is_auth())
6885 mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
6886
6887 // did we import srci? if so, explicitly ack the import before we unlock and reply.
6888
6889 assert(g_conf->mds_kill_rename_at != 7);
6890
6891 // reply
6892 respond_to_request(mdr, 0);
6893
6894 if (need_eval)
6895 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
6896
6897 // clean up?
6898 // respond_to_request() drops locks. So stray reintegration can race with us.
6899 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6900 mdcache->notify_stray(straydn);
6901 }
6902 }
6903
6904
6905
6906 // helpers
6907
6908 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
6909 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
6910 {
6911 if (mds->is_cluster_degraded() &&
6912 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6913 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
6914 if (mdr->more()->waiting_on_slave.empty())
6915 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6916 return false;
6917 }
6918
6919 dout(10) << "_rename_prepare_witness mds." << who << dendl;
6920 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6921 MMDSSlaveRequest::OP_RENAMEPREP);
6922
6923 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
6924 for (auto dn : srctrace)
6925 req->srcdnpath.push_dentry(dn->name);
6926 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
6927 for (auto dn : dsttrace)
6928 req->destdnpath.push_dentry(dn->name);
6929 if (straydn)
6930 mdcache->replicate_stray(straydn, who, req->stray);
6931
6932 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
6933
6934 // srcdn auth will verify our current witness list is sufficient
6935 req->witnesses = witnesses;
6936
6937 req->op_stamp = mdr->get_op_stamp();
6938 mds->send_message_mds(req, who);
6939
6940 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6941 mdr->more()->waiting_on_slave.insert(who);
6942 return true;
6943 }
6944
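// Decode the inode bundle the srcdn auth attached to its OP_RENAMEPREPACK:
// pull in the client map and caps, force-open sessions for the imported
// clients, and leave the inode marked !auth/clean until _rename_apply
// completes the import.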
6945 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
6946 {
6947 version_t oldpv = mdr->more()->inode_import_v;
6948
6949 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
6950
6951 /* import node */
6952 bufferlist::iterator blp = mdr->more()->inode_import.begin();
6953
6954 // imported caps
6955 ::decode(mdr->more()->imported_client_map, blp);
6956 ::encode(mdr->more()->imported_client_map, *client_map_bl,
6957 mds->mdsmap->get_up_features());
6958 prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
6959
6960 list<ScatterLock*> updated_scatterlocks;
6961 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
6962 mdr->more()->cap_imports, updated_scatterlocks);
6963
6964 // hack: force back to !auth and clean, temporarily
6965 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
6966 srcdnl->get_inode()->mark_clean();
6967
6968 return oldpv;
6969 }
6970
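// Even when we are not auth for the dentry, the rename must be journaled if
// the directory tree being moved contains auth subtree roots: replay needs
// the rename event to re-anchor those subtrees under the new parent.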
6971 bool Server::_need_force_journal(CInode *diri, bool empty)
6972 {
6973 list<CDir*> ls;
6974 diri->get_dirfrags(ls);
6975
6976 bool force_journal = false;
6977 if (empty) {
6978 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6979 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
6980 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
6981 force_journal = true;
6982 break;
6983 } else
6984 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
6985 }
6986 } else {
6987 // see if any children of our frags are auth subtrees.
6988 list<CDir*> subtrees;
6989 mdcache->list_subtrees(subtrees);
6990 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
6991 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6992 CDir *dir = *p;
6993 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
6994 if (dir->contains(*q)) {
6995 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
6996 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
6997 << **q << dendl;
6998 force_journal = true;
6999 break;
7000 } else
7001 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
7002 } else
7003 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
7004 }
7005 if (force_journal)
7006 break;
7007 }
7008 }
7009 return force_journal;
7010 }
7011
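// Project the whole rename into a single metablob: account the overwritten
// target (moving it to straydn if it was primary), link srci at destdn, null
// out srcdn, and predirty every affected dirfrag's fragstat/rstat.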
7012 void Server::_rename_prepare(MDRequestRef& mdr,
7013 EMetaBlob *metablob, bufferlist *client_map_bl,
7014 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7015 {
7016 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7017 if (straydn)
7018 dout(10) << " straydn " << *straydn << dendl;
7019
7020 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7021 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7022 CInode *srci = srcdnl->get_inode();
7023 CInode *oldin = destdnl->get_inode();
7024
7025 // primary+remote link merge?
7026 bool linkmerge = (srci == destdnl->get_inode() &&
7027 (srcdnl->is_primary() || destdnl->is_primary()));
7028 bool silent = srcdn->get_dir()->inode->is_stray();
7029
7030 bool force_journal_dest = false;
7031 if (srci->is_dir() && !destdn->is_auth()) {
7032 if (srci->is_auth()) {
7033 // if we are auth for srci and exporting it, force journal because journal replay needs
7034 // the source inode to create auth subtrees.
7035 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7036 force_journal_dest = true;
7037 } else
7038 force_journal_dest = _need_force_journal(srci, false);
7039 }
7040
7041 bool force_journal_stray = false;
7042 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7043 force_journal_stray = _need_force_journal(oldin, true);
7044
7045 if (linkmerge)
7046 dout(10) << " merging remote and primary links to the same inode" << dendl;
7047 if (silent)
7048 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7049 if (force_journal_dest)
7050 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7051 if (force_journal_stray)
7052 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7053
7054 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7055 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7056 metablob->renamed_dirino = srci->ino();
7057 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7058 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7059 metablob->renamed_dirino = oldin->ino();
7060 }
7061
7062 // prepare
7063 inode_t *pi = 0; // renamed inode
7064 inode_t *tpi = 0; // target/overwritten inode
7065
7066 // target inode
7067 if (!linkmerge) {
7068 if (destdnl->is_primary()) {
7069 assert(straydn); // moving to straydn.
7070 // link--, and move.
7071 if (destdn->is_auth()) {
7072 tpi = oldin->project_inode(); //project_snaprealm
7073 tpi->version = straydn->pre_dirty(tpi->version);
7074 tpi->update_backtrace();
7075 }
7076 straydn->push_projected_linkage(oldin);
7077 } else if (destdnl->is_remote()) {
7078 // nlink-- targeti
7079 if (oldin->is_auth()) {
7080 tpi = oldin->project_inode();
7081 tpi->version = oldin->pre_dirty();
7082 }
7083 }
7084 }
7085
7086 // dest
7087 if (srcdnl->is_remote()) {
7088 if (!linkmerge) {
7089 // destdn
7090 if (destdn->is_auth())
7091 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
7092 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7093 // srci
7094 if (srci->is_auth()) {
7095 pi = srci->project_inode();
7096 pi->version = srci->pre_dirty();
7097 }
7098 } else {
7099 dout(10) << " will merge remote onto primary link" << dendl;
7100 if (destdn->is_auth()) {
7101 pi = oldin->project_inode();
7102 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
7103 }
7104 }
7105 } else { // primary
7106 if (destdn->is_auth()) {
7107 version_t oldpv;
7108 if (srcdn->is_auth())
7109 oldpv = srci->get_projected_version();
7110 else {
7111 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
7112
7113 // note which dirfrags have child subtrees in the journal
7114 // event, so that we can open those (as bounds) during replay.
7115 if (srci->is_dir()) {
7116 list<CDir*> ls;
7117 srci->get_dirfrags(ls);
7118 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7119 CDir *dir = *p;
7120 if (!dir->is_auth())
7121 metablob->renamed_dir_frags.push_back(dir->get_frag());
7122 }
7123 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
7124 }
7125 }
7126 pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7127 // & srcdnl->snaprealm
7128 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
7129 pi->update_backtrace();
7130 }
7131 destdn->push_projected_linkage(srci);
7132 }
7133
7134 // src
7135 if (srcdn->is_auth())
7136 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
7137 srcdn->push_projected_linkage(); // push null linkage
7138
7139 if (!silent) {
7140 if (pi) {
7141 pi->ctime = mdr->get_op_stamp();
7142 pi->change_attr++;
7143 if (linkmerge)
7144 pi->nlink--;
7145 }
7146 if (tpi) {
7147 tpi->ctime = mdr->get_op_stamp();
7148 tpi->change_attr++;
7149 destdn->make_path_string(tpi->stray_prior_path, true);
7150 tpi->nlink--;
7151 if (tpi->nlink == 0)
7152 oldin->state_set(CInode::STATE_ORPHAN);
7153 }
7154 }
7155
7156 // prepare nesting, mtime updates
7157 int predirty_dir = silent ? 0:PREDIRTY_DIR;
7158
7159 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7160 // then link the source inode to destdn
7161 if (destdnl->is_primary()) {
7162 assert(straydn);
7163 if (straydn->is_auth()) {
7164 metablob->add_dir_context(straydn->get_dir());
7165 metablob->add_dir(straydn->get_dir(), true);
7166 }
7167 }
7168
7169 // sub off target
7170 if (destdn->is_auth() && !destdnl->is_null()) {
7171 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
7172 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
7173 if (destdnl->is_primary())
7174 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
7175 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7176 }
7177
7178 // move srcdn
7179 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
7180 int flags = predirty_dir | predirty_primary;
7181 if (srcdn->is_auth())
7182 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
7183 if (destdn->is_auth())
7184 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
7185
7186 SnapRealm *src_realm = srci->find_snaprealm();
7187 SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
7188 snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
7189
7190 // add it all to the metablob
7191 // target inode
7192 if (!linkmerge) {
7193 if (destdnl->is_primary()) {
7194 if (destdn->is_auth()) {
7195 // project snaprealm, too
7196 if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
7197 oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
7198 straydn->first = MAX(oldin->first, next_dest_snap);
7199 metablob->add_primary_dentry(straydn, oldin, true, true);
7200 } else if (force_journal_stray) {
7201 dout(10) << " forced journaling straydn " << *straydn << dendl;
7202 metablob->add_dir_context(straydn->get_dir());
7203 metablob->add_primary_dentry(straydn, oldin, true);
7204 }
7205 } else if (destdnl->is_remote()) {
7206 if (oldin->is_auth()) {
7207 // auth for targeti
7208 metablob->add_dir_context(oldin->get_projected_parent_dir());
7209 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
7210 CEPH_NOSNAP, 0, destdnl);
7211 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
7212 }
7213 }
7214 }
7215
7216 // dest
7217 if (srcdnl->is_remote()) {
7218 if (!linkmerge) {
7219 if (destdn->is_auth() && !destdnl->is_null())
7220 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7221 else
7222 destdn->first = MAX(destdn->first, next_dest_snap);
7223
7224 if (destdn->is_auth())
7225 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7226 if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
7227 metablob->add_dir_context(srci->get_projected_parent_dir());
7228 mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
7229 metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
7230 }
7231 } else {
7232 if (destdn->is_auth() && !destdnl->is_null())
7233 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7234 else
7235 destdn->first = MAX(destdn->first, next_dest_snap);
7236
7237 if (destdn->is_auth())
7238 metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
7239 }
7240 } else if (srcdnl->is_primary()) {
7241 // project snap parent update?
7242 if (destdn->is_auth() && src_realm != dest_realm &&
7243 (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
7244 srci->project_past_snaprealm_parent(dest_realm);
7245
7246 if (destdn->is_auth() && !destdnl->is_null())
7247 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7248 else
7249 destdn->first = MAX(destdn->first, next_dest_snap);
7250
7251 if (destdn->is_auth())
7252 metablob->add_primary_dentry(destdn, srci, true, true);
7253 else if (force_journal_dest) {
7254 dout(10) << " forced journaling destdn " << *destdn << dendl;
7255 metablob->add_dir_context(destdn->get_dir());
7256 metablob->add_primary_dentry(destdn, srci, true);
7257 if (srcdn->is_auth() && srci->is_dir()) {
7258 // journal new subtrees root dirfrags
7259 list<CDir*> ls;
7260 srci->get_dirfrags(ls);
7261 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7262 CDir *dir = *p;
7263 if (dir->is_auth())
7264 metablob->add_dir(dir, true);
7265 }
7266 }
7267 }
7268 }
7269
7270 // src
7271 if (srcdn->is_auth()) {
7272 dout(10) << " journaling srcdn " << *srcdn << dendl;
7273 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
7274 // also journal the inode in case we need to do slave rename rollback. It is OK to add
7275 // both primary and NULL dentries, because during journal replay the null dentry is
7276 // processed after the primary dentry.
7277 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
7278 metablob->add_primary_dentry(srcdn, srci, true);
7279 metablob->add_null_dentry(srcdn, true);
7280 } else
7281 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
7282
7283 // make renamed inode first track the dn
7284 if (srcdnl->is_primary() && destdn->is_auth())
7285 srci->first = destdn->first;
7286
7287 if (oldin && oldin->is_dir())
7288 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
7289 if (srci->is_dir())
7290 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
7291
7292 }
7293
7294
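// Apply the projected rename in replay-safe order: unlink the overwritten
// target (popping its stray linkage first, if primary), unlink srcdn, pop the
// projected destdn linkage, then finish any inode/cap import and fix subtrees.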
7295 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7296 {
7297 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7298 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
7299
7300 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7301 CDentry::linkage_t *destdnl = destdn->get_linkage();
7302
7303 CInode *oldin = destdnl->get_inode();
7304
7305 bool imported_inode = false;
7306
7307 // primary+remote link merge?
7308 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7309 (srcdnl->is_primary() || destdnl->is_primary()));
7310
7311 // target inode
7312 if (!linkmerge) {
7313 if (destdnl->is_primary()) {
7314 assert(straydn);
7315 dout(10) << "straydn is " << *straydn << dendl;
7316 destdn->get_dir()->unlink_inode(destdn, false);
7317
7318 straydn->pop_projected_linkage();
7319 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7320 assert(!straydn->is_projected()); // no other projected
7321
7322 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7323
7324 // nlink-- targeti
7325 if (destdn->is_auth()) {
7326 bool hadrealm = (oldin->snaprealm ? true : false);
7327 oldin->pop_and_dirty_projected_inode(mdr->ls);
7328 if (oldin->snaprealm && !hadrealm)
7329 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
7330 } else {
7331 // FIXME this snaprealm is not filled out correctly
7332 //oldin->open_snaprealm(); might be sufficient..
7333 }
7334 } else if (destdnl->is_remote()) {
7335 destdn->get_dir()->unlink_inode(destdn, false);
7336 if (oldin->is_auth())
7337 oldin->pop_and_dirty_projected_inode(mdr->ls);
7338 }
7339 }
7340
7341 // unlink src before we relink it at dest
7342 CInode *in = srcdnl->get_inode();
7343 assert(in);
7344
7345 bool srcdn_was_remote = srcdnl->is_remote();
7346 srcdn->get_dir()->unlink_inode(srcdn);
7347
7348 // dest
7349 if (srcdn_was_remote) {
7350 if (!linkmerge) {
7351 // destdn
7352 destdnl = destdn->pop_projected_linkage();
7353 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7354 assert(!destdn->is_projected()); // no other projected
7355
7356 destdn->link_remote(destdnl, in);
7357 if (destdn->is_auth())
7358 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
7359 // in
7360 if (in->is_auth())
7361 in->pop_and_dirty_projected_inode(mdr->ls);
7362 } else {
7363 dout(10) << "merging remote onto primary link" << dendl;
7364 oldin->pop_and_dirty_projected_inode(mdr->ls);
7365 }
7366 } else { // primary
7367 if (linkmerge) {
7368 dout(10) << "merging primary onto remote link" << dendl;
7369 destdn->get_dir()->unlink_inode(destdn, false);
7370 }
7371 destdnl = destdn->pop_projected_linkage();
7372 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7373 assert(!destdn->is_projected()); // no other projected
7374
7375 // srcdn inode import?
7376 if (!srcdn->is_auth() && destdn->is_auth()) {
7377 assert(mdr->more()->inode_import.length() > 0);
7378
7379 map<client_t,Capability::Import> imported_caps;
7380
7381 // finish cap imports
7382 finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
7383 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
7384 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
7385 mdr->more()->srcdn_auth_mds, true,
7386 mdr->more()->cap_imports[destdnl->get_inode()],
7387 imported_caps);
7388 }
7389
7390 mdr->more()->inode_import.clear();
7391 ::encode(imported_caps, mdr->more()->inode_import);
7392
7393 /* hack: add an auth pin for each xlock we hold. These were
7394 * remote xlocks previously but now they're local and
7395 * we're going to try and unpin when we xlock_finish. */
7396 for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
7397 i != mdr->xlocks.end();
7398 ++i)
7399 if ((*i)->get_parent() == destdnl->get_inode() &&
7400 !(*i)->is_locallock())
7401 mds->locker->xlock_import(*i);
7402
7403 // hack: fix auth bit
7404 in->state_set(CInode::STATE_AUTH);
7405 imported_inode = true;
7406
7407 mdr->clear_ambiguous_auth();
7408 }
7409
7410 if (destdn->is_auth()) {
7411 in->pop_and_dirty_projected_inode(mdr->ls);
7412
7413 } else {
7414 // FIXME: fix up snaprealm!
7415 }
7416 }
7417
7418 // src
7419 if (srcdn->is_auth())
7420 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
7421 srcdn->pop_projected_linkage();
7422 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7423 assert(!srcdn->is_projected()); // no other projected
7424
7425 // apply remaining projected inodes (nested)
7426 mdr->apply();
7427
7428 // update subtree map?
7429 if (destdnl->is_primary() && in->is_dir())
7430 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true, imported_inode);
7431
7432 if (straydn && oldin->is_dir())
7433 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
7434
7435 // removing a new dn?
7436 if (srcdn->is_auth())
7437 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
7438 }
7439
7440
7441
7442 // ------------
7443 // SLAVE
7444
7445 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7446 CDentry *srcdn, *destdn, *straydn;
7447 public:
7448 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7449 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7450 void finish(int r) override {
7451 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7452 }
7453 };
7454
7455 class C_MDS_SlaveRenameCommit : public ServerContext {
7456 MDRequestRef mdr;
7457 CDentry *srcdn, *destdn, *straydn;
7458 public:
7459 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7460 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7461 void finish(int r) override {
7462 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7463 }
7464 };
7465
7466 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7467 MDRequestRef mdr;
7468 public:
7469 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7470 ServerContext(s), mdr(r) {}
7471 void finish(int r) override {
7472 server->_slave_rename_sessions_flushed(mdr);
7473 }
7474 };
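// Slave-side prep for a multi-MDS rename: traverse both paths, and if we are
// the srcdn auth, freeze srci, wait for rename-notify acks from bystander
// replicas and for client session flushes, then journal an
// ESlaveUpdate::OP_PREPARE carrying enough original state to roll back.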
7475
7476 /* This function DOES put the mdr->slave_request before returning*/
7477 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7478 {
7479 dout(10) << "handle_slave_rename_prep " << *mdr
7480 << " " << mdr->slave_request->srcdnpath
7481 << " to " << mdr->slave_request->destdnpath
7482 << dendl;
7483
7484 if (mdr->slave_request->is_interrupted()) {
7485 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7486 MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7487 reply->mark_interrupted();
7488 mds->send_message_mds(reply, mdr->slave_to_mds);
7489 mdr->slave_request->put();
7490 mdr->slave_request = 0;
7491 return;
7492 }
7493
7494 // discover destdn
7495 filepath destpath(mdr->slave_request->destdnpath);
7496 dout(10) << " dest " << destpath << dendl;
7497 vector<CDentry*> trace;
7498 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7499 if (r > 0) return;
7500 if (r == -ESTALE) {
7501 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7502 mdr->slave_to_mds);
7503 return;
7504 }
7505 assert(r == 0); // we shouldn't get an error here!
7506
7507 CDentry *destdn = trace[trace.size()-1];
7508 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7509 dout(10) << " destdn " << *destdn << dendl;
7510 mdr->pin(destdn);
7511
7512 // discover srcdn
7513 filepath srcpath(mdr->slave_request->srcdnpath);
7514 dout(10) << " src " << srcpath << dendl;
7515 CInode *srci = nullptr;
7516 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7517 if (r > 0) return;
7518 assert(r == 0);
7519
7520 // srcpath must not point to a null dentry
7521 assert(srci != nullptr);
7522
7523 CDentry *srcdn = trace[trace.size()-1];
7524 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7525 dout(10) << " srcdn " << *srcdn << dendl;
7526 mdr->pin(srcdn);
7527 mdr->pin(srci);
7528
7529 // stray?
7530 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7531 (srcdnl->is_primary() || destdnl->is_primary()));
7532 CDentry *straydn = mdr->straydn;
7533 if (destdnl->is_primary() && !linkmerge)
7534 assert(straydn);
7535
7536 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7537 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7538
7539 // set up commit waiter (early, to clean up any freezing etc we do)
7540 if (!mdr->more()->slave_commit)
7541 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7542
7543 // am i srcdn auth?
7544 if (srcdn->is_auth()) {
7545 set<mds_rank_t> srcdnrep;
7546 srcdn->list_replicas(srcdnrep);
7547
7548 bool reply_witness = false;
7549 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7550 // freeze?
7551 // we need this to
7552 // - avoid conflicting lock state changes
7553 // - avoid concurrent updates to the inode
7554 // (this could also be accomplished with the versionlock)
7555 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7556 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7557 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7558 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7559
7560 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7561 if (srcdnl->get_inode()->is_frozen_auth_pin())
7562 mdr->unfreeze_auth_pin();
7563
7564 if (!frozen_inode) {
7565 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7566 return;
7567 }
7568
7569 /*
7570 * set ambiguous auth for srci
7571 * NOTE: we don't worry about ambiguous cache expire as we do
7572 * with subtree migrations because all slaves will pin
7573 * srcdn->get_inode() for duration of this rename.
7574 */
7575 mdr->set_ambiguous_auth(srcdnl->get_inode());
7576
7577 // just mark the source inode as ambiguous auth if more than two MDSes are involved.
7578 // the master will send another OP_RENAMEPREP slave request later.
7579 if (mdr->slave_request->witnesses.size() > 1) {
7580 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7581 reply_witness = true;
7582 }
7583
7584 // make sure bystanders have received all lock related messages
7585 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7586 if (*p == mdr->slave_to_mds ||
7587 (mds->is_cluster_degraded() &&
7588 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7589 continue;
7590 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7591 MMDSSlaveRequest::OP_RENAMENOTIFY);
7592 mds->send_message_mds(notify, *p);
7593 mdr->more()->waiting_on_slave.insert(*p);
7594 }
7595
7596 // make sure clients have received all cap related messages
7597 set<client_t> export_client_set;
7598 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7599
7600 MDSGatherBuilder gather(g_ceph_context);
7601 flush_client_sessions(export_client_set, gather);
7602 if (gather.has_subs()) {
7603 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7604 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7605 gather.activate();
7606 }
7607 }
7608
7609 // is witness list sufficient?
7610 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7611 if (*p == mdr->slave_to_mds ||
7612 mdr->slave_request->witnesses.count(*p)) continue;
7613 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7614 reply_witness = true;
7615 break;
7616 }
7617
7618 if (reply_witness) {
7619 assert(!srcdnrep.empty());
7620 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7621 MMDSSlaveRequest::OP_RENAMEPREPACK);
7622 reply->witnesses.swap(srcdnrep);
7623 mds->send_message_mds(reply, mdr->slave_to_mds);
7624 mdr->slave_request->put();
7625 mdr->slave_request = 0;
7626 return;
7627 }
7628 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7629 if (!mdr->more()->waiting_on_slave.empty()) {
7630 dout(10) << " still waiting for rename notify acks from "
7631 << mdr->more()->waiting_on_slave << dendl;
7632 return;
7633 }
7634 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7635 // set ambiguous auth for srci on witnesses
7636 mdr->set_ambiguous_auth(srcdnl->get_inode());
7637 }
7638
7639 // encode everything we'd need to roll this back... basically, just the original state.
7640 rename_rollback rollback;
7641
7642 rollback.reqid = mdr->reqid;
7643
7644 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7645 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7646 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7647 rollback.orig_src.dname = srcdn->name;
7648 if (srcdnl->is_primary())
7649 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7650 else {
7651 assert(srcdnl->is_remote());
7652 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7653 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7654 }
7655
7656 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7657 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7658 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7659 rollback.orig_dest.dname = destdn->name;
7660 if (destdnl->is_primary())
7661 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7662 else if (destdnl->is_remote()) {
7663 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7664 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7665 }
7666
7667 if (straydn) {
7668 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7669 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7670 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7671 rollback.stray.dname = straydn->name;
7672 }
7673 ::encode(rollback, mdr->more()->rollback_bl);
7674 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7675
7676 // journal.
7677 mdr->ls = mdlog->get_current_segment();
7678 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7679 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7680 mdlog->start_entry(le);
7681 le->rollback = mdr->more()->rollback_bl;
7682
7683 bufferlist blah; // inode import data... obviously not used if we're the slave
7684 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7685
7686 if (le->commit.empty()) {
7687 dout(10) << " empty metablob, skipping journal" << dendl;
7688 mdlog->cancel_entry(le);
7689 mdr->ls = NULL;
7690 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7691 } else {
7692 mdr->more()->slave_update_journaled = true;
7693 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7694 mdr, __func__);
7695 mdlog->flush();
7696 }
7697 }
7698
7699 void Server::_logged_slave_rename(MDRequestRef& mdr,
7700 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7701 {
7702 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7703
7704 // prepare ack
7705 MMDSSlaveRequest *reply = NULL;
7706 if (!mdr->aborted) {
7707 reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7708 if (!mdr->more()->slave_update_journaled)
7709 reply->mark_not_journaled();
7710 }
7711
7712 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7713 CDentry::linkage_t *destdnl = NULL;
7714 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7715
7716 // export srci?
7717 if (srcdn->is_auth() && srcdnl->is_primary()) {
7718 // set export bounds for CInode::encode_export()
7719 list<CDir*> bounds;
7720 if (srcdnl->get_inode()->is_dir()) {
7721 srcdnl->get_inode()->get_dirfrags(bounds);
7722 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7723 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7724 }
7725
7726 map<client_t,entity_inst_t> exported_client_map;
7727 bufferlist inodebl;
7728 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7729 exported_client_map);
7730
7731 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7732 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7733
7734 if (reply) {
7735 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7736 reply->inode_export.claim_append(inodebl);
7737 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7738 }
7739
7740 // remove mdr auth pin
7741 mdr->auth_unpin(srcdnl->get_inode());
7742 mdr->more()->is_inode_exporter = true;
7743
7744 if (srcdnl->get_inode()->is_dirty())
7745 srcdnl->get_inode()->mark_clean();
7746
7747 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
7748 }
7749
7750 // apply
7751 _rename_apply(mdr, srcdn, destdn, straydn);
7752
7753 destdnl = destdn->get_linkage();
7754
7755 // bump popularity
7756 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
7757 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7758 mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
7759 META_POP_IWR);
7760
7761 // done.
7762 mdr->slave_request->put();
7763 mdr->slave_request = 0;
7764 mdr->straydn = 0;
7765
7766 if (reply) {
7767 mds->send_message_mds(reply, mdr->slave_to_mds);
7768 } else {
7769 assert(mdr->aborted);
7770 dout(10) << " abort flag set, finishing" << dendl;
7771 mdcache->request_finish(mdr);
7772 }
7773 }
7774
7775 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7776 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7777 {
7778 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7779
7780 CDentry::linkage_t *destdnl = destdn->get_linkage();
7781
7782 list<MDSInternalContextBase*> finished;
7783 if (r == 0) {
7784 // unfreeze+singleauth inode
7785 // hmm, do i really need to delay this?
7786 if (mdr->more()->is_inode_exporter) {
7787
7788 CInode *in = destdnl->get_inode();
7789
7790 // drop our pins
7791 // we exported, clear out any xlocks that we moved to another MDS
7792 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7793 while (i != mdr->xlocks.end()) {
7794 SimpleLock *lock = *i++;
7795
7796 // we only care about xlocks on the exported inode
7797 if (lock->get_parent() == in &&
7798 !lock->is_locallock())
7799 mds->locker->xlock_export(lock, mdr.get());
7800 }
7801
7802 map<client_t,Capability::Import> peer_imported;
7803 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7804 ::decode(peer_imported, bp);
7805
7806 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7807 mdcache->migrator->finish_export_inode(destdnl->get_inode(),
7808 mdr->get_mds_stamp(),
7809 mdr->slave_to_mds, peer_imported, finished);
7810 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
7811
7812 // unfreeze
7813 assert(destdnl->get_inode()->is_frozen_inode());
7814 destdnl->get_inode()->unfreeze_inode(finished);
7815 }
7816
7817 // singleauth
7818 if (mdr->more()->is_ambiguous_auth) {
7819 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7820 mdr->more()->is_ambiguous_auth = false;
7821 }
7822
7823 if (straydn && mdr->more()->slave_update_journaled) {
7824 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7825 if (strayin && !strayin->snaprealm)
7826 mdcache->clear_dirty_bits_for_stray(strayin);
7827 }
7828
7829 mds->queue_waiters(finished);
7830 mdr->cleanup();
7831
7832 if (mdr->more()->slave_update_journaled) {
7833 // write a commit to the journal
7834 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7835 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7836 ESlaveUpdate::RENAME);
7837 mdlog->start_entry(le);
7838 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7839 mdlog->flush();
7840 } else {
7841 _committed_slave(mdr);
7842 }
7843 } else {
7844
7845 // abort
7846 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7847 // witness list to the master, and the master failed before we tried prep again.
7848 if (mdr->more()->rollback_bl.length()) {
7849 if (mdr->more()->is_inode_exporter) {
7850 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7851 destdnl->get_inode()->abort_export();
7852 }
7853 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7854 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7855 // rollback but preserve the slave request
7856 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7857 mdr->more()->rollback_bl.clear();
7858 } else
7859 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7860 } else {
7861 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
7862 // singleauth
7863 if (mdr->more()->is_ambiguous_auth) {
7864 if (srcdn->is_auth())
7865 mdr->more()->rename_inode->unfreeze_inode(finished);
7866
7867 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7868 mdr->more()->is_ambiguous_auth = false;
7869 }
7870 mds->queue_waiters(finished);
7871 mdcache->request_finish(mdr);
7872 }
7873 }
7874 }
7875
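// Reverse the predirtied stats on one dirfrag: undo the link/unlink delta on
// fragstat and rstat, and restore the recorded mtime/rctime if this rename
// was what last touched them.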
7876 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7877 bool isdir, int linkunlink, nest_info_t &rstat)
7878 {
7879 fnode_t *pf;
7880 pf = dir->project_fnode();
7881 mut->add_projected_fnode(dir);
7882 pf->version = dir->pre_dirty();
7883
7884 if (isdir) {
7885 pf->fragstat.nsubdirs += linkunlink;
7886 } else {
7887 pf->fragstat.nfiles += linkunlink;
7888 }
7889 if (r.ino) {
7890 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7891 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7892 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7893 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7894 }
7895 if (pf->fragstat.mtime == ctime) {
7896 pf->fragstat.mtime = r.dirfrag_old_mtime;
7897 if (pf->rstat.rctime == ctime)
7898 pf->rstat.rctime = r.dirfrag_old_rctime;
7899 }
7900 mut->add_updated_lock(&dir->get_inode()->filelock);
7901 mut->add_updated_lock(&dir->get_inode()->nestlock);
7902 }
7903
7904 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
7905 MutationRef mut;
7906 CDentry *srcdn;
7907 version_t srcdnpv;
7908 CDentry *destdn;
7909 CDentry *straydn;
7910 bool finish_mdr;
7911 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
7912 CDentry *sd, version_t pv, CDentry *dd,
7913 CDentry *st, bool f) :
7914 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
7915 straydn(st), finish_mdr(f) {}
7916 void finish(int r) override {
7917 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
7918 destdn, straydn, finish_mdr);
7919 }
7920 };
7921
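// Roll a prepared rename back from the encoded rename_rollback record:
// restore the original src/dest linkages, ctimes, and dir stats, journal an
// ESlaveUpdate::OP_ROLLBACK (skipped if the prepare was never journaled),
// and undo the projected subtree moves.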
7922 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
7923 bool finish_mdr)
7924 {
7925 rename_rollback rollback;
7926 bufferlist::iterator p = rbl.begin();
7927 ::decode(rollback, p);
7928
7929 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
7930 // need to finish this update before sending resolve to claim the subtree
7931 mdcache->add_rollback(rollback.reqid, master);
7932
7933 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7934 mut->ls = mds->mdlog->get_current_segment();
7935
7936 CDentry *srcdn = NULL;
7937 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
7938 if (!srcdir)
7939 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
7940 if (srcdir) {
7941 dout(10) << " srcdir " << *srcdir << dendl;
7942 srcdn = srcdir->lookup(rollback.orig_src.dname);
7943 if (srcdn) {
7944 dout(10) << " srcdn " << *srcdn << dendl;
7945 assert(srcdn->get_linkage()->is_null());
7946 } else
7947 dout(10) << " srcdn not found" << dendl;
7948 } else
7949 dout(10) << " srcdir not found" << dendl;
7950
7951 CDentry *destdn = NULL;
7952 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
7953 if (!destdir)
7954 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
7955 if (destdir) {
7956 dout(10) << " destdir " << *destdir << dendl;
7957 destdn = destdir->lookup(rollback.orig_dest.dname);
7958 if (destdn)
7959 dout(10) << " destdn " << *destdn << dendl;
7960 else
7961 dout(10) << " destdn not found" << dendl;
7962 } else
7963 dout(10) << " destdir not found" << dendl;
7964
7965 CInode *in = NULL;
7966 if (rollback.orig_src.ino) {
7967 in = mdcache->get_inode(rollback.orig_src.ino);
7968 if (in && in->is_dir())
7969 assert(srcdn && destdn);
7970 } else
7971 in = mdcache->get_inode(rollback.orig_src.remote_ino);
7972
7973 CDir *straydir = NULL;
7974 CDentry *straydn = NULL;
7975 if (rollback.stray.dirfrag.ino) {
7976 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
7977 if (straydir) {
7978 dout(10) << "straydir " << *straydir << dendl;
7979 straydn = straydir->lookup(rollback.stray.dname);
7980 if (straydn) {
7981 dout(10) << " straydn " << *straydn << dendl;
7982 assert(straydn->get_linkage()->is_primary());
7983 } else
7984 dout(10) << " straydn not found" << dendl;
7985 } else
7986 dout(10) << "straydir not found" << dendl;
7987 }
7988
7989 CInode *target = NULL;
7990 if (rollback.orig_dest.ino) {
7991 target = mdcache->get_inode(rollback.orig_dest.ino);
7992 if (target)
7993 assert(destdn && straydn);
7994 } else if (rollback.orig_dest.remote_ino)
7995 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
7996
7997 // can't use is_auth() in the resolve stage
7998 mds_rank_t whoami = mds->get_nodeid();
7999 // slave
8000 assert(!destdn || destdn->authority().first != whoami);
8001 assert(!straydn || straydn->authority().first != whoami);
8002
8003 bool force_journal_src = false;
8004 bool force_journal_dest = false;
8005 if (in && in->is_dir() && srcdn->authority().first != whoami)
8006 force_journal_src = _need_force_journal(in, false);
8007 if (in && target && target->is_dir())
8008 force_journal_dest = _need_force_journal(in, true);
8009
8010 version_t srcdnpv = 0;
8011 // repair src
8012 if (srcdn) {
8013 if (srcdn->authority().first == whoami)
8014 srcdnpv = srcdn->pre_dirty();
8015 if (rollback.orig_src.ino) {
8016 assert(in);
8017 srcdn->push_projected_linkage(in);
8018 } else
8019 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
8020 rollback.orig_src.remote_d_type);
8021 }
8022
8023 inode_t *pi = 0;
8024 if (in) {
8025 if (in->authority().first == whoami) {
8026 pi = in->project_inode();
8027 mut->add_projected_inode(in);
8028 pi->version = in->pre_dirty();
8029 } else
8030 pi = in->get_projected_inode();
8031 if (pi->ctime == rollback.ctime)
8032 pi->ctime = rollback.orig_src.old_ctime;
8033 }
8034
8035 if (srcdn && srcdn->authority().first == whoami) {
8036 nest_info_t blah;
8037 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
8038 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
8039 }
8040
8041 // repair dest
8042 if (destdn) {
8043 if (rollback.orig_dest.ino && target) {
8044 destdn->push_projected_linkage(target);
8045 } else if (rollback.orig_dest.remote_ino) {
8046 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
8047 rollback.orig_dest.remote_d_type);
8048 } else {
8049 // the dentry will be trimmed soon, it's ok to have wrong linkage
8050 if (rollback.orig_dest.ino)
8051 assert(mds->is_resolve());
8052 destdn->push_projected_linkage();
8053 }
8054 }
8055
8056 if (straydn)
8057 straydn->push_projected_linkage();
8058
8059 if (target) {
8060 inode_t *ti = NULL;
8061 if (target->authority().first == whoami) {
8062 ti = target->project_inode();
8063 mut->add_projected_inode(target);
8064 ti->version = target->pre_dirty();
8065 } else
8066 ti = target->get_projected_inode();
8067 if (ti->ctime == rollback.ctime)
8068 ti->ctime = rollback.orig_dest.old_ctime;
8069 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
8070 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
8071 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
8072 else
8073 assert(rollback.orig_dest.remote_ino &&
8074 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
8075 } else
8076 ti->nlink++;
8077 }
8078
8079 if (srcdn)
8080 dout(0) << " srcdn back to " << *srcdn << dendl;
8081 if (in)
8082 dout(0) << " srci back to " << *in << dendl;
8083 if (destdn)
8084 dout(0) << " destdn back to " << *destdn << dendl;
8085 if (target)
8086 dout(0) << " desti back to " << *target << dendl;
8087
8088 // journal it
8089 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
8090 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
8091 mdlog->start_entry(le);
8092
8093 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
8094 le->commit.add_dir_context(srcdir);
8095 if (rollback.orig_src.ino)
8096 le->commit.add_primary_dentry(srcdn, 0, true);
8097 else
8098 le->commit.add_remote_dentry(srcdn, true);
8099 }
8100
8101 if (!rollback.orig_src.ino && // remote linkage
8102 in && in->authority().first == whoami) {
8103 le->commit.add_dir_context(in->get_projected_parent_dir());
8104 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
8105 }
8106
8107 if (force_journal_dest) {
8108 assert(rollback.orig_dest.ino);
8109 le->commit.add_dir_context(destdir);
8110 le->commit.add_primary_dentry(destdn, 0, true);
8111 }
8112
8113 // slave: no need to journal straydn
8114
8115 if (target && target != in && target->authority().first == whoami) {
8116 assert(rollback.orig_dest.remote_ino);
8117 le->commit.add_dir_context(target->get_projected_parent_dir());
8118 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
8119 }
8120
8121 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
8122 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
8123 le->commit.renamed_dirino = in->ino();
8124 if (srcdn->authority().first == whoami) {
8125 list<CDir*> ls;
8126 in->get_dirfrags(ls);
8127 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8128 CDir *dir = *p;
8129 if (!dir->is_auth())
8130 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8131 }
8132 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8133 }
8134 } else if (force_journal_dest) {
8135 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8136 le->commit.renamed_dirino = target->ino();
8137 }
8138
8139 if (target && target->is_dir()) {
8140 assert(destdn);
8141 mdcache->project_subtree_rename(target, straydir, destdir);
8142 }
8143
8144 if (in && in->is_dir()) {
8145 assert(srcdn);
8146 mdcache->project_subtree_rename(in, destdir, srcdir);
8147 }
8148
8149 if (mdr && !mdr->more()->slave_update_journaled) {
8150 assert(le->commit.empty());
8151 mdlog->cancel_entry(le);
8152 mut->ls = NULL;
8153 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8154 } else {
8155 assert(!le->commit.empty());
8156 if (mdr)
8157 mdr->more()->slave_update_journaled = false;
8158 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8159 destdn, straydn, finish_mdr);
8160 submit_mdlog_entry(le, fin, mdr, __func__);
8161 mdlog->flush();
8162 }
8163 }
8164
8165 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
8166 version_t srcdnpv, CDentry *destdn,
8167 CDentry *straydn, bool finish_mdr)
8168 {
8169 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
8170
8171 if (straydn) {
8172 straydn->get_dir()->unlink_inode(straydn);
8173 straydn->pop_projected_linkage();
8174 }
8175 if (destdn) {
8176 destdn->get_dir()->unlink_inode(destdn);
8177 destdn->pop_projected_linkage();
8178 }
8179 if (srcdn) {
8180 srcdn->pop_projected_linkage();
8181 if (srcdn->authority().first == mds->get_nodeid())
8182 srcdn->mark_dirty(srcdnpv, mut->ls);
8183 }
8184
8185 mut->apply();
8186
8187 if (srcdn && srcdn->get_linkage()->is_primary()) {
8188 CInode *in = srcdn->get_linkage()->get_inode();
8189 if (srcdn->authority().first == mds->get_nodeid())
8190 in->state_set(CInode::STATE_AUTH);
8191 // update subtree map?
8192 if (in && in->is_dir()) {
8193 assert(destdn);
8194 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
8195 }
8196 }
8197
8198 if (destdn) {
8199 CInode *oldin = destdn->get_linkage()->get_inode();
8200 // update subtree map?
8201 if (oldin && oldin->is_dir()) {
8202 assert(straydn);
8203 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
8204 }
8205 }
8206
8207 if (mds->is_resolve()) {
8208 CDir *root = NULL;
8209 if (straydn)
8210 root = mdcache->get_subtree_root(straydn->get_dir());
8211 else if (destdn)
8212 root = mdcache->get_subtree_root(destdn->get_dir());
8213 if (root)
8214 mdcache->try_trim_non_auth_subtree(root);
8215 }
8216
8217 if (mdr) {
8218 list<MDSInternalContextBase*> finished;
8219 if (mdr->more()->is_ambiguous_auth) {
8220 if (srcdn->is_auth())
8221 mdr->more()->rename_inode->unfreeze_inode(finished);
8222
8223 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8224 mdr->more()->is_ambiguous_auth = false;
8225 }
8226 mds->queue_waiters(finished);
8227 if (finish_mdr || mdr->aborted)
8228 mdcache->request_finish(mdr);
8229 else
8230 mdr->more()->slave_rolling_back = false;
8231 }
8232
8233 mdcache->finish_rollback(mut->reqid);
8234
8235 mut->cleanup();
8236 }
8237
8238 /* This function DOES put the passed message before returning*/
8239 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8240 {
8241 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8242 << " witnessed by " << ack->get_source()
8243 << " " << *ack << dendl;
8244 mds_rank_t from = mds_rank_t(ack->get_source().num());
8245
8246 // note slave
8247 mdr->more()->slaves.insert(from);
8248 if (mdr->more()->srcdn_auth_mds == from &&
8249 mdr->more()->is_remote_frozen_authpin &&
8250 !mdr->more()->is_ambiguous_auth) {
8251 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
8252 }
8253
8254 // witnessed? or add extra witnesses?
8255 assert(mdr->more()->witnessed.count(from) == 0);
8256 if (ack->is_interrupted()) {
8257 dout(10) << " slave request interrupted, noop" << dendl;
8258 } else if (ack->witnesses.empty()) {
8259 mdr->more()->witnessed.insert(from);
8260 if (!ack->is_not_journaled())
8261 mdr->more()->has_journaled_slaves = true;
8262 } else {
8263 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
8264 mdr->more()->extra_witnesses.swap(ack->witnesses);
8265 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
8266 }
8267
8268 // srci import?
8269 if (ack->inode_export.length()) {
8270 dout(10) << " got srci import" << dendl;
8271 mdr->more()->inode_import.claim(ack->inode_export);
8272 mdr->more()->inode_import_v = ack->inode_export_v;
8273 }
8274
8275 // remove from waiting list
8276 assert(mdr->more()->waiting_on_slave.count(from));
8277 mdr->more()->waiting_on_slave.erase(from);
8278
8279 if (mdr->more()->waiting_on_slave.empty())
8280 dispatch_client_request(mdr); // go again!
8281 else
8282 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
8283 }
8284
8285 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8286 {
8287 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8288 << ack->get_source() << dendl;
8289 assert(mdr->is_slave());
8290 mds_rank_t from = mds_rank_t(ack->get_source().num());
8291
8292 if (mdr->more()->waiting_on_slave.count(from)) {
8293 mdr->more()->waiting_on_slave.erase(from);
8294
8295 if (mdr->more()->waiting_on_slave.empty()) {
8296 if (mdr->slave_request)
8297 dispatch_slave_request(mdr);
8298 } else
8299 dout(10) << " still waiting for rename notify acks from "
8300 << mdr->more()->waiting_on_slave << dendl;
8301 }
8302 }
8303
8304 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8305 {
8306 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8307
8308 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8309 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8310
8311 if (mdr->more()->waiting_on_slave.empty()) {
8312 if (mdr->slave_request)
8313 dispatch_slave_request(mdr);
8314 } else
8315 dout(10) << " still waiting for rename notify acks from "
8316 << mdr->more()->waiting_on_slave << dendl;
8317 }
8318 }
8319
8320 // snaps
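// lssnap returns a readdir-style reply; the extra bufferlist is laid out as
// encoded below:
//   dirstat (empty), __u32 num, __u16 flags (CEPH_READDIR_FRAG_END /
//   _COMPLETE), then num x { snap_name, infinite LeaseStat, InodeStat }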
8321 /* This function takes responsibility for the passed mdr*/
8322 void Server::handle_client_lssnap(MDRequestRef& mdr)
8323 {
8324 MClientRequest *req = mdr->client_request;
8325
8326 // traverse to path
8327 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8328 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8329 respond_to_request(mdr, -ESTALE);
8330 return;
8331 }
8332 if (!diri->is_auth()) {
8333 mdcache->request_forward(mdr, diri->authority().first);
8334 return;
8335 }
8336 if (!diri->is_dir()) {
8337 respond_to_request(mdr, -ENOTDIR);
8338 return;
8339 }
8340 dout(10) << "lssnap on " << *diri << dendl;
8341
8342 // lock snap
8343 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8344 mds->locker->include_snap_rdlocks(rdlocks, diri);
8345 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8346 return;
8347
8348 if (!check_access(mdr, diri, MAY_READ))
8349 return;
8350
8351 SnapRealm *realm = diri->find_snaprealm();
8352 map<snapid_t,SnapInfo*> infomap;
8353 realm->get_snap_info(infomap, diri->get_oldest_snap());
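  // infomap may include snaps created on ancestor realms; those get their
  // long name ("_<name>_<ino>") below, while local snaps keep the plain name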
8354
8355 unsigned max_entries = req->head.args.readdir.max_entries;
8356 if (!max_entries)
8357 max_entries = infomap.size();
8358 int max_bytes = req->head.args.readdir.max_bytes;
8359 if (!max_bytes)
8360 // make sure at least one item can be encoded
8361 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
8362
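  // path2 optionally names the last snap from the previous reply; resume
  // the listing just past it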
8363 __u64 last_snapid = 0;
8364 string offset_str = req->get_path2();
8365 if (!offset_str.empty())
8366 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
8367
8368 bufferlist dirbl;
8369 encode_empty_dirstat(dirbl);
8370
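  // adjust the byte budget for the dirstat header plus the num/flags
  // trailer that frame the dentry list encoded below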
8371 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
8372
8373 __u32 num = 0;
8374 bufferlist dnbl;
8375 map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
8376 for (; p != infomap.end() && num < max_entries; ++p) {
8377 dout(10) << p->first << " -> " << *p->second << dendl;
8378
8379 // actual
8380 string snap_name;
8381 if (p->second->ino == diri->ino())
8382 snap_name = p->second->name;
8383 else
8384 snap_name = p->second->get_long_name();
8385
8386 unsigned start_len = dnbl.length();
8387 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
8388 break;
8389
8390 ::encode(snap_name, dnbl);
8391 encode_infinite_lease(dnbl);
8392
8393 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
8394 if (r < 0) {
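      // out of space: roll dnbl back to the last fully-encoded entry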
8395 bufferlist keep;
8396 keep.substr_of(dnbl, 0, start_len);
8397 dnbl.swap(keep);
8398 break;
8399 }
8400 ++num;
8401 }
8402
8403 ::encode(num, dirbl);
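  // FRAG_END: no entries remain past this reply; FRAG_COMPLETE additionally
  // means the listing started from the beginning (no offset key), so the
  // client may treat it as the whole snapdir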
8404 __u16 flags = 0;
8405 if (p == infomap.end()) {
8406 flags = CEPH_READDIR_FRAG_END;
8407 if (last_snapid == 0)
8408 flags |= CEPH_READDIR_FRAG_COMPLETE;
8409 }
8410 ::encode(flags, dirbl);
8411 dirbl.claim_append(dnbl);
8412
8413 mdr->reply_extra_bl = dirbl;
8414 mdr->tracei = diri;
8415 respond_to_request(mdr, 0);
8416 }
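/*
 * A sketch of the reply payload assembled above (field order as encoded;
 * see the ::encode calls):
 *
 *   dirstat            encode_empty_dirstat(dirbl)
 *   __u32  num         number of snap entries that fit
 *   __u16  flags       CEPH_READDIR_FRAG_END / CEPH_READDIR_FRAG_COMPLETE
 *   num times:
 *     string     snap_name
 *     LeaseStat  encode_infinite_lease(dnbl)
 *     InodeStat  diri->encode_inodestat(...)
 */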
8417
8418
8419 // MKSNAP
8420
8421 struct C_MDS_mksnap_finish : public ServerLogContext {
8422 CInode *diri;
8423 SnapInfo info;
8424 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8425 ServerLogContext(s, r), diri(di), info(i) {}
8426 void finish(int r) override {
8427 server->_mksnap_finish(mdr, diri, info);
8428 }
8429 };
8430
8431 /* This function takes responsibility for the passed mdr. */
8432 void Server::handle_client_mksnap(MDRequestRef& mdr)
8433 {
8434 if (!mds->mdsmap->allows_snaps()) {
8435 // snapshot creation is disabled until the MDSMap 'allow snaps' flag is set
8436 respond_to_request(mdr, -EPERM);
8437 return;
8438 }
8439
8440 MClientRequest *req = mdr->client_request;
8441 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8442 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8443 respond_to_request(mdr, -ESTALE);
8444 return;
8445 }
8446
8447 if (!diri->is_auth()) { // fw to auth?
8448 mdcache->request_forward(mdr, diri->authority().first);
8449 return;
8450 }
8451
8452 // dir only
8453 if (!diri->is_dir()) {
8454 respond_to_request(mdr, -ENOTDIR);
8455 return;
8456 }
8457 if (diri->is_system() && !diri->is_root()) {
8458 // no snaps in system dirs (root is ok)
8459 respond_to_request(mdr, -EPERM);
8460 return;
8461 }
8462
8463 const string &snapname = req->get_filepath().last_dentry();
8464
8465 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8466 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8467 respond_to_request(mdr, -EPERM);
8468 return;
8469 }
8470
8471 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
8472
8473 // lock snap
8474 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8475
8476 mds->locker->include_snap_rdlocks(rdlocks, diri);
8477 rdlocks.erase(&diri->snaplock);
8478 xlocks.insert(&diri->snaplock);
8479
8480 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8481 return;
8482
8483 if (!check_access(mdr, diri, MAY_WRITE))
8484 return;
8485
8486 // make sure name is unique
8487 if (diri->snaprealm &&
8488 diri->snaprealm->exists(snapname)) {
8489 respond_to_request(mdr, -EEXIST);
8490 return;
8491 }
8492 if (snapname.length() == 0 ||
8493 snapname[0] == '_') {
8494 respond_to_request(mdr, -EINVAL);
8495 return;
8496 }
8497
8498 // allocate a snapid
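  // Two-phase snaptable update: the first pass through here submits a
  // prepare and returns; C_MDS_RetryRequest re-dispatches the request once
  // the stid/snapid have been assigned, and _mksnap_finish commits the
  // table transaction after the journal entry is safe.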
8499 if (!mdr->more()->stid) {
8500 // prepare an stid
8501 mds->snapclient->prepare_create(diri->ino(), snapname,
8502 mdr->get_mds_stamp(),
8503 &mdr->more()->stid, &mdr->more()->snapidbl,
8504 new C_MDS_RetryRequest(mdcache, mdr));
8505 return;
8506 }
8507
8508 version_t stid = mdr->more()->stid;
8509 snapid_t snapid;
8510 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8511 ::decode(snapid, p);
8512 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
8513
8514 // journal
8515 SnapInfo info;
8516 info.ino = diri->ino();
8517 info.snapid = snapid;
8518 info.name = snapname;
8519 info.stamp = mdr->get_op_stamp();
8520
8521 inode_t *pi = diri->project_inode();
8522 pi->ctime = info.stamp;
8523 pi->version = diri->pre_dirty();
8524
8525 // project the snaprealm
8526 sr_t *newsnap = diri->project_snaprealm(snapid);
8527 newsnap->snaps[snapid] = info;
8528 newsnap->seq = snapid;
8529 newsnap->last_created = snapid;
8530
8531 // journal the inode changes
8532 mdr->ls = mdlog->get_current_segment();
8533 EUpdate *le = new EUpdate(mdlog, "mksnap");
8534 mdlog->start_entry(le);
8535
8536 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8537 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8538 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8539 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8540
8541 // journal the snaprealm changes
8542 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
8543 mdr, __func__);
8544 mdlog->flush();
8545 }
8546
8547 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
8548 {
8549 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
8550
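  // if the directory already had its own snaprealm this is a plain snap
  // create; otherwise applying the projected realm opens a new one, and
  // clients' caps must be split out of the parent realm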
8551 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
8552
8553 diri->pop_and_dirty_projected_inode(mdr->ls);
8554 mdr->apply();
8555
8556 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8557
8558 // create snap
8559 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8560
8561 mdcache->do_realm_invalidate_and_update_notify(diri, op);
8562
8563 // yay
8564 mdr->in[0] = diri;
8565 mdr->snapid = info.snapid;
8566 mdr->tracei = diri;
8567 respond_to_request(mdr, 0);
8568 }
8569
8570
8571 // RMSNAP
8572
8573 struct C_MDS_rmsnap_finish : public ServerLogContext {
8574 CInode *diri;
8575 snapid_t snapid;
8576 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8577 ServerLogContext(s, r), diri(di), snapid(sn) {}
8578 void finish(int r) override {
8579 server->_rmsnap_finish(mdr, diri, snapid);
8580 }
8581 };
8582
8583 /* This function takes responsibility for the passed mdr. */
8584 void Server::handle_client_rmsnap(MDRequestRef& mdr)
8585 {
8586 MClientRequest *req = mdr->client_request;
8587
8588 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8589 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8590 respond_to_request(mdr, -ESTALE);
8591 return;
8592 }
8593 if (!diri->is_auth()) { // fw to auth?
8594 mdcache->request_forward(mdr, diri->authority().first);
8595 return;
8596 }
8597 if (!diri->is_dir()) {
8598 respond_to_request(mdr, -ENOTDIR);
8599 return;
8600 }
8601
8602 const string &snapname = req->get_filepath().last_dentry();
8603
8604 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8605 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8606 respond_to_request(mdr, -EPERM);
8607 return;
8608 }
8609
8610 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
8611
8612 // does snap exist?
8613 if (snapname.length() == 0 || snapname[0] == '_') {
8614 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
8615 return;
8616 }
8617 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
8618 respond_to_request(mdr, -ENOENT);
8619 return;
8620 }
8621 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
8622 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
8623
8624 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8625 mds->locker->include_snap_rdlocks(rdlocks, diri);
8626 rdlocks.erase(&diri->snaplock);
8627 xlocks.insert(&diri->snaplock);
8628
8629 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8630 return;
8631
8632 if (!check_access(mdr, diri, MAY_WRITE))
8633 return;
8634
8635 // prepare
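  // as in mksnap: prepare the snaptable destroy first, retry once the
  // stid/seq arrive, and commit in _rmsnap_finish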
8636 if (!mdr->more()->stid) {
8637 mds->snapclient->prepare_destroy(diri->ino(), snapid,
8638 &mdr->more()->stid, &mdr->more()->snapidbl,
8639 new C_MDS_RetryRequest(mdcache, mdr));
8640 return;
8641 }
8642 version_t stid = mdr->more()->stid;
8643 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8644 snapid_t seq;
8645 ::decode(seq, p);
8646 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8647
8648 // journal
8649 inode_t *pi = diri->project_inode();
8650 pi->version = diri->pre_dirty();
8651 pi->ctime = mdr->get_op_stamp();
8652
8653 mdr->ls = mdlog->get_current_segment();
8654 EUpdate *le = new EUpdate(mdlog, "rmsnap");
8655 mdlog->start_entry(le);
8656
8657 // project the snaprealm
8658 sr_t *newsnap = diri->project_snaprealm();
8659 newsnap->snaps.erase(snapid);
8660 newsnap->seq = seq;
8661 newsnap->last_destroyed = seq;
8662
8663 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8664 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8665 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8666 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8667
8668 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
8669 mdr, __func__);
8670 mdlog->flush();
8671 }
8672
8673 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8674 {
8675 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
8676 version_t stid = mdr->more()->stid;
8677 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8678 snapid_t seq;
8679 ::decode(seq, p);
8680
8681 diri->pop_and_dirty_projected_inode(mdr->ls);
8682 mdr->apply();
8683
8684 mds->snapclient->commit(stid, mdr->ls);
8685
8686 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8687
8688 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8689
8690 // yay
8691 mdr->in[0] = diri;
8692 respond_to_request(mdr, 0);
8693
8694 // purge snapshot data
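  // stale snap data can only be identified once all past parent realms are
  // open and get_snaps() reflects the realm's full snap set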
8695 if (diri->snaprealm->have_past_parents_open())
8696 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8697 }
8698
8699 struct C_MDS_renamesnap_finish : public ServerLogContext {
8700 CInode *diri;
8701 snapid_t snapid;
8702 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8703 ServerLogContext(s, r), diri(di), snapid(sn) {}
8704 void finish(int r) override {
8705 server->_renamesnap_finish(mdr, diri, snapid);
8706 }
8707 };
8708
8709 /* This function takes responsibility for the passed mdr. */
8710 void Server::handle_client_renamesnap(MDRequestRef& mdr)
8711 {
8712 MClientRequest *req = mdr->client_request;
8713 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
8714 respond_to_request(mdr, -EINVAL);
8715 return;
8716 }
8717
8718 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8719 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8720 respond_to_request(mdr, -ESTALE);
8721 return;
8722 }
8723
8724 if (!diri->is_auth()) { // fw to auth?
8725 mdcache->request_forward(mdr, diri->authority().first);
8726 return;
8727 }
8728
8729 if (!diri->is_dir()) { // dir only
8730 respond_to_request(mdr, -ENOTDIR);
8731 return;
8732 }
8733
8734 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
8735 mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8736 respond_to_request(mdr, -EPERM);
8737 return;
8738 }
8739
8740 const string &dstname = req->get_filepath().last_dentry();
8741 const string &srcname = req->get_filepath2().last_dentry();
8742 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
8743
8744 if (srcname.length() == 0 || srcname[0] == '_') {
8745 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
8746 return;
8747 }
8748 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
8749 respond_to_request(mdr, -ENOENT);
8750 return;
8751 }
8752 if (dstname.length() == 0 || dstname[0] == '_') {
8753 respond_to_request(mdr, -EINVAL);
8754 return;
8755 }
8756 if (diri->snaprealm->exists(dstname)) {
8757 respond_to_request(mdr, -EEXIST);
8758 return;
8759 }
8760
8761 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
8762 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
8763
8764 // lock snap
8765 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8766
8767 mds->locker->include_snap_rdlocks(rdlocks, diri);
8768 rdlocks.erase(&diri->snaplock);
8769 xlocks.insert(&diri->snaplock);
8770
8771 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8772 return;
8773
8774 if (!check_access(mdr, diri, MAY_WRITE))
8775 return;
8776
8777 // prepare
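  // same two-phase snaptable pattern as mksnap/rmsnap; the transaction is
  // committed in _renamesnap_finish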
8778 if (!mdr->more()->stid) {
8779 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
8780 &mdr->more()->stid, &mdr->more()->snapidbl,
8781 new C_MDS_RetryRequest(mdcache, mdr));
8782 return;
8783 }
8784
8785 version_t stid = mdr->more()->stid;
8786 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8787 snapid_t seq;
8788 ::decode(seq, p);
8789 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8790
8791 // journal
8792 inode_t *pi = diri->project_inode();
8793 pi->ctime = mdr->get_op_stamp();
8794 pi->version = diri->pre_dirty();
8795
8796 // project the snaprealm
8797 sr_t *newsnap = diri->project_snaprealm();
8798 assert(newsnap->snaps.count(snapid));
8799 newsnap->snaps[snapid].name = dstname;
8800
8801 // journal the inode changes
8802 mdr->ls = mdlog->get_current_segment();
8803 EUpdate *le = new EUpdate(mdlog, "renamesnap");
8804 mdlog->start_entry(le);
8805
8806 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8807 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8808 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8809 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8810
8811 // journal the snaprealm changes
8812 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
8813 mdr, __func__);
8814 mdlog->flush();
8815 }
8816
8817 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8818 {
8819 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
8820
8821 diri->pop_and_dirty_projected_inode(mdr->ls);
8822 mdr->apply();
8823
8824 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8825
8826 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8827
8828 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
8829
8830 // yay
8831 mdr->in[0] = diri;
8832 mdr->tracei = diri;
8833 mdr->snapid = snapid;
8834 respond_to_request(mdr, 0);
8835 }
8836
8837 /**
8838 * Return true if server is in state RECONNECT and this
8839 * client has not yet reconnected.
8840 */
8841 bool Server::waiting_for_reconnect(client_t c) const
8842 {
8843 return client_reconnect_gather.count(c) > 0;
8844 }
8845
8846 void Server::dump_reconnect_status(Formatter *f) const
8847 {
8848 f->open_object_section("reconnect_status");
8849 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
8850 f->close_section();
8851 }