// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "messages/MClientSession.h"
#include "messages/MClientRequest.h"
#include "messages/MClientReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MLock.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <iostream>
using namespace std;

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

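// Completion contexts bound to the owning Server; get_mds() lets the
// MDSInternalContextBase / MDSLogContextBase machinery find the MDSRank.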
class ServerContext : public MDSInternalContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
                      "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_slave_request", "Slave requests dispatched");
  plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
                      "Request type lookup hash of inode");
  plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
                      "Request type lookup inode");
  plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
                      "Request type lookup parent");
  plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
                      "Request type lookup name");
  plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
                      "Request type lookup");
  plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
                      "Request type lookup snapshot");
  plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
                      "Request type get attribute");
  plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
                      "Request type set attribute");
  plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
                      "Request type set file layout");
  plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
                      "Request type set directory layout");
  plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
                      "Request type set extended attribute");
  plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
                      "Request type remove extended attribute");
  plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
                      "Request type read directory");
  plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
                      "Request type set file lock");
  plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
                      "Request type get file lock");
  plb.add_u64_counter(l_mdss_req_create, "req_create",
                      "Request type create");
  plb.add_u64_counter(l_mdss_req_open, "req_open",
                      "Request type open");
  plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
                      "Request type make node");
  plb.add_u64_counter(l_mdss_req_link, "req_link",
                      "Request type link");
  plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
                      "Request type unlink");
  plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
                      "Request type remove directory");
  plb.add_u64_counter(l_mdss_req_rename, "req_rename",
                      "Request type rename");
  plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
                      "Request type make directory");
  plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
                      "Request type symbolic link");
  plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
                      "Request type list snapshot");
  plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
                      "Request type make snapshot");
  plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
                      "Request type remove snapshot");
  plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
                      "Request type rename snapshot");
  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false)
{
}


/* This function DOES put the passed message before returning */
void Server::dispatch(Message *m)
{
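  // client reconnect messages are special-cased: handle them even before
  // this rank is active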
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(static_cast<MClientReconnect*>(m));
    return;
  }

  // active?
  if (!mds->is_active() &&
      !(mds->is_stopping() && m->get_source().is_mds())) {
    if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
        (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
      MClientRequest *req = static_cast<MClientRequest*>(m);
      Session *session = get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        req->put();
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
      } else if (req->get_retry_attempt()) {
        // process completed requests in the clientreplay stage. A completed request
        // might have created a new file/directory. This guarantees the MDS sends a
        // reply to the client before another request modifies the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
      // handle_slave_request() will wait if necessary
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      // session open requests need to be handled during replay,
      // close requests need to be delayed
      if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
           (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
        wait_for_active = false;
      } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
        MClientRequest *req = static_cast<MClientRequest*>(m);
        if (req->is_queued_for_replay()) {
          wait_for_active = false;
        }
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(static_cast<MClientRequest*>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    assert(0 == "server unknown message");
  }
}



// ----------------------------------------------------------
// SESSION management

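// Journal-completion context for session state changes: once the ESession
// event is safely logged, it applies the change via Server::_session_logged().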
class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

Session *Server::get_session(Message *m)
{
  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
  if (session) {
    dout(20) << "get_session have " << session << " " << session->info.inst
             << " state " << session->get_state_name() << dendl;
    session->put();  // we do not carry the ref
  } else {
    dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
  }
  return session;
}

/* This function DOES put the passed message before returning */
void Server::handle_client_session(MClientSession *m)
{
  version_t pv;
  bool blacklisted = false;
  Session *session = get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client());  // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    m->put();
    return;
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing()) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      m->put();
      return;
    }
    assert(session->is_closed() ||
           session->is_closing());

    blacklisted = mds->objecter->with_osdmap(
        [session](const OSDMap &osd_map) -> bool {
          return osd_map.is_blacklisted(session->info.inst.addr);
        });

    if (blacklisted) {
      dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
      m->put();
      return;
    }

    session->set_client_metadata(m->client_meta);
    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
             << session->info.client_metadata.size() << " metadata entries:" << dendl;
    for (map<string, string>::iterator i = session->info.client_metadata.begin();
         i != session->info.client_metadata.end(); ++i) {
      dout(20) << " " << i->first << ": " << i->second << dendl;
    }

    // Special case for the 'root' metadata path; validate that the claimed
    // root is actually within the caps of the session
    if (session->info.client_metadata.count("root")) {
      const auto claimed_root = session->info.client_metadata.at("root");
      // claimed_root has a leading "/" which we strip before passing
      // into caps check
      if (claimed_root.empty() || claimed_root[0] != '/' ||
          !session->auth_caps.path_capable(claimed_root.substr(1))) {
        derr << __func__ << " forbidden path claimed as mount root: "
             << claimed_root << " by " << m->get_source() << dendl;
        // Tell the client we're rejecting their open
        mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
        mds->clog->warn() << "client session with invalid root '"
                          << claimed_root << "' denied (" << session->info.inst << ")";
        session->clear();
        // Drop out; don't record this session in SessionMap or journal it.
        break;
      }
    }

    if (session->is_closed())
      mds->sessionmap.add_session(session);

    pv = mds->sessionmap.mark_projected(session);
    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    mds->sessionmap.touch_session(session);
    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
                              new C_MDS_session_finish(this, session, sseq, true, pv));
    mdlog->flush();
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() ||
        session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        m->put();
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        m->put();
        return;
      }
      assert(session->is_open() ||
             session->is_stale() ||
             session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        m->put();
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle it the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        m->put();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    mdlog->flush();
    break;

  default:
    ceph_abort();
  }
  m->put();
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
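  // ask each open session to flush in-flight session messages; the gather
  // completes once every CEPH_SESSION_FLUSHMSG_ACK has come back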
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    assert(session);
    if (!session->is_open() ||
        !session->connection.get() ||
        !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
      continue;
    version_t seq = session->wait_for_flush(gather.new_sub());
    mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  list<MDSInternalContextBase*> finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open" : "close")
           << " " << pv << dendl;

  if (piv) {
    assert(session->is_closing() || session->is_killing() ||
           session->is_opening());  // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    assert(session->connection != NULL);
    session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
    if (mdcache->is_readonly())
      session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, session->info.inst.name.num());
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.count(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      client_reconnect_gather.erase(session->info.get_client());

      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->connection != NULL) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->connection->mark_disposable();
      }

      // reset session
      mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->connection != NULL) {
        session->connection->mark_down();
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,uint64_t>& sseqmap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;
  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {

    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing())
      sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    else
      assert(session->is_open() ||
             session->is_opening() ||
             session->is_stale());
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                        map<client_t,uint64_t>& sseqmap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;


  int sessions_inserted = 0;
  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    sessions_inserted++;

    Session *session = mds->sessionmap.get_session(p->second.name);
    assert(session);

    if (sseqmap.count(p->first)) {
      uint64_t sseq = sseqmap[p->first];
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
        if (mdcache->is_readonly())
          mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(2) << "terminate_sessions" << dendl;

  terminating_sessions = true;

  // kill them off. clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}


void Server::find_idle_sessions()
{
  dout(10) << "find_idle_sessions. laggy until " << mds->get_laggy_until() << dendl;

  // timeout/stale
  // (caps go stale, leases die)
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_session_timeout;
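  // open sessions are kept ordered by last_cap_renew (touch_session moves a
  // session to the back), so we can stop at the first sufficiently fresh one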
  while (1) {
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
    if (!session) break;
    dout(20) << "laggiest active session is " << session->info.inst << dendl;
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
    mds->sessionmap.set_state(session, Session::STATE_STALE);
    mds->locker->revoke_stale_caps(session);
    mds->locker->remove_stale_leases(session);
    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
    finish_flush_session(session, session->get_push_seq());
  }

  // autoclose
  cutoff = now;
  cutoff -= g_conf->mds_session_autoclose;

  // don't kick clients if we've been laggy
  if (mds->get_laggy_until() > cutoff) {
    dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
             << ", not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 &&
      mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "not evicting a slow client, because there is only one"
             << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  std::vector<Session *> to_evict;
  const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
    return;
  }
  const auto &stale_sessions = sessions_p->second;
  assert(stale_sessions != nullptr);

  for (const auto &session : *stale_sessions) {
    if (session->is_importing()) {
      dout(10) << "stopping at importing session " << session->info.inst << dendl;
      break;
    }
    assert(session->is_stale());
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    to_evict.push_back(session);
  }

  for (const auto &session : to_evict) {
    utime_t age = now;
    age -= session->last_cap_renew;
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << age << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst << " last "
             << session->last_cap_renew << dendl;

    if (g_conf->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->info.inst.name.num(), false, true,
                        ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

/*
 * XXX bump in the interface here, not using an MDSInternalContextBase here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    assert(session->is_closing() ||
           session->is_closed() ||
           session->is_killing() ||
           session->is_importing());
    if (on_safe) {
      on_safe->complete(0);
    }
  }
}

size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
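  // evict any client session whose address appears in the OSDMap blacklist;
  // MDS daemons are handled separately, via the MDSMap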
  std::list<Session*> victims;
  const auto sessions = mds->sessionmap.get_sessions();
  for (const auto &p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    if (blacklist.count(s->info.inst.addr)) {
      victims.push_back(s);
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;
  mds->sessionmap.get_client_set(client_reconnect_gather);

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = ceph_clock_now();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

/* This function DOES put the passed message before returning */
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = get_session(m);
  assert(session);

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect()) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (session->is_closed()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = ceph_clock_now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
        dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
        mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
        dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
               << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_export_caps(p->first, from, p->second,
                                  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}



void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  assert(reconnect_done);
  reconnect_done->complete(0);
  reconnect_done = NULL;
}

void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  utime_t reconnect_end = reconnect_start;
  reconnect_end += g_conf->mds_reconnect_timeout;
  if (ceph_clock_now() >= reconnect_end &&
      !client_reconnect_gather.empty()) {
    dout(10) << "reconnect timed out" << dendl;

    // If we're doing blacklist evictions, use this to wait for them before
    // proceeding to reconnect_gather_finish
    MDSGatherBuilder gather(g_ceph_context);

    for (set<client_t>::iterator p = client_reconnect_gather.begin();
         p != client_reconnect_gather.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
      assert(session);
      dout(1) << "reconnect gave up on " << session->info.inst << dendl;

      mds->clog->warn() << "evicting unresponsive client " << *session
                        << ", after waiting " << g_conf->mds_reconnect_timeout
                        << " seconds during MDS startup";

      if (g_conf->mds_session_blacklist_on_timeout) {
        std::stringstream ss;
        mds->evict_client(session->info.inst.name.num(), false, true, ss,
                          gather.new_sub());
      } else {
        kill_session(session, NULL);
      }

      failed_reconnects++;
    }
    client_reconnect_gather.clear();

    if (gather.has_subs()) {
      dout(1) << "reconnect will complete once clients are evicted" << dendl;
      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
            [this](int r) { reconnect_gather_finish(); })));
      gather.activate();
      reconnect_evicting = true;
    } else {
      reconnect_gather_finish();
    }
  }
}

void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
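  // the encoded blob holds the fcntl (POSIX) locks first, then the flock
  // locks, each group preceded by its count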
  int numlocks;
  ceph_filelock lock;
  bufferlist::iterator p = locks.begin();
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}


/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
void Server::recall_client_state(void)
{
  /* never ask a client to keep more than 80% of all caps in the system */
  uint64_t max_caps_per_client = (Capability::count() * .8);
  /* and leave clients at or below this floor alone */
  uint64_t min_caps_per_client = 100;
  /* ratio: the fraction of its caps each client may keep. Derived from how
   * far the cache is over its reservation, and capped so that we never recall
   * more than 80% of a client's caps at once. */
  double ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
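  // e.g. a cache 10% over its reservation yields ratio 0.9: a client holding
  // 1000 caps is asked to shrink to 900 (bounded above by max_caps_per_client;
  // clients at or below min_caps_per_client are left alone)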

  dout(10) << "recall_client_state " << ratio
           << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
           << dendl;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto &session : sessions) {
    if (!session->is_open() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << " session " << session->info.inst
             << " caps " << session->caps.size()
             << ", leases " << session->leases.size()
             << dendl;

    if (session->caps.size() > min_caps_per_client) {
      uint64_t newlim = MIN((session->caps.size() * ratio), max_caps_per_client);
      if (session->caps.size() > newlim) {
        MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
        m->head.max_caps = newlim;
        mds->send_message_client(m, session);
        session->notify_recall_sent(newlim);
      }
    }
  }
}

void Server::force_clients_readonly()
{
  dout(10) << "force_clients_readonly" << dendl;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (!session->info.inst.name.is_client() ||
        !(session->is_open() || session->is_stale()))
      continue;
    mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
  }
}

/*******
 * some generic stuff for finishing off requests
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  early_reply(mdr, in, dn);

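  // journal the update; `fin` runs once the event is safely committed and
  // normally sends the final (safe) reply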
  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    mds->locker->drop_rdlocks(mdr.get());
  else
    mdlog->flush();
}

void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                const char *event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event_string(event_str);
  }
  mdlog->submit_entry(le, fin);
}

/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));

    // add here to avoid counting ops multiple times (e.g., locks, loading)
    switch (mdr->client_request->get_op()) {
    case CEPH_MDS_OP_LOOKUPHASH:
      logger->inc(l_mdss_req_lookuphash);
      break;
    case CEPH_MDS_OP_LOOKUPINO:
      logger->inc(l_mdss_req_lookupino);
      break;
    case CEPH_MDS_OP_LOOKUPPARENT:
      logger->inc(l_mdss_req_lookupparent);
      break;
    case CEPH_MDS_OP_LOOKUPNAME:
      logger->inc(l_mdss_req_lookupname);
      break;
    case CEPH_MDS_OP_LOOKUP:
      logger->inc(l_mdss_req_lookup);
      break;
    case CEPH_MDS_OP_LOOKUPSNAP:
      logger->inc(l_mdss_req_lookupsnap);
      break;
    case CEPH_MDS_OP_GETATTR:
      logger->inc(l_mdss_req_getattr);
      break;
    case CEPH_MDS_OP_SETATTR:
      logger->inc(l_mdss_req_setattr);
      break;
    case CEPH_MDS_OP_SETLAYOUT:
      logger->inc(l_mdss_req_setlayout);
      break;
    case CEPH_MDS_OP_SETDIRLAYOUT:
      logger->inc(l_mdss_req_setdirlayout);
      break;
    case CEPH_MDS_OP_SETXATTR:
      logger->inc(l_mdss_req_setxattr);
      break;
    case CEPH_MDS_OP_RMXATTR:
      logger->inc(l_mdss_req_rmxattr);
      break;
    case CEPH_MDS_OP_READDIR:
      logger->inc(l_mdss_req_readdir);
      break;
    case CEPH_MDS_OP_SETFILELOCK:
      logger->inc(l_mdss_req_setfilelock);
      break;
    case CEPH_MDS_OP_GETFILELOCK:
      logger->inc(l_mdss_req_getfilelock);
      break;
    case CEPH_MDS_OP_CREATE:
      logger->inc(l_mdss_req_create);
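      // note: no break here, so a create also increments the open counter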
    case CEPH_MDS_OP_OPEN:
      logger->inc(l_mdss_req_open);
      break;
    case CEPH_MDS_OP_MKNOD:
      logger->inc(l_mdss_req_mknod);
      break;
    case CEPH_MDS_OP_LINK:
      logger->inc(l_mdss_req_link);
      break;
    case CEPH_MDS_OP_UNLINK:
      logger->inc(l_mdss_req_unlink);
      break;
    case CEPH_MDS_OP_RMDIR:
      logger->inc(l_mdss_req_rmdir);
      break;
    case CEPH_MDS_OP_RENAME:
      logger->inc(l_mdss_req_rename);
      break;
    case CEPH_MDS_OP_MKDIR:
      logger->inc(l_mdss_req_mkdir);
      break;
    case CEPH_MDS_OP_SYMLINK:
      logger->inc(l_mdss_req_symlink);
      break;
    case CEPH_MDS_OP_LSSNAP:
      logger->inc(l_mdss_req_lssnap);
      break;
    case CEPH_MDS_OP_MKSNAP:
      logger->inc(l_mdss_req_mksnap);
      break;
    case CEPH_MDS_OP_RMSNAP:
      logger->inc(l_mdss_req_rmsnap);
      break;
    case CEPH_MDS_OP_RENAMESNAP:
      logger->inc(l_mdss_req_renamesnap);
      break;
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}

void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf->mds_early_reply)
    return;

  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  MClientRequest *req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  MClientReply *reply = new MClientReply(req, 0);
  reply->set_unsafe();
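  // an unsafe reply lets the client see the result before the journal entry
  // commits; a safe reply follows once the update is durable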

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  // _rename_finish() does not send dentry link/unlink messages to replicas,
  // so do not mark xlocks on dentries "done"; those xlocks prevent dentries
  // with projected linkages from getting new replicas.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
           << " (" << cpp_strerror(reply->get_result())
           << ") " << *req << dendl;

  if (tracei || tracedn) {
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
                   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  req->get_connection()->send_message(reply);

  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}

/*
 * send the given reply, including a trace to tracei
 * clean up mdr
 */
void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
{
  assert(mdr.get());
  MClientRequest *req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
          << " (" << cpp_strerror(reply->get_result())
          << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special, they only modify states in MDS memory.
  // The states get lost when the MDS fails. If a client re-sends a completed
  // setfilelock request, it means the client did not receive the corresponding
  // setfilelock reply. So the MDS should re-execute the setfilelock request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  snapid_t snapid = mdr->snapid;
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();
  int dentry_wanted = req->get_dentry_wanted();

  if (!did_early_reply && !is_replay) {

    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    dout(20) << "lat " << lat << dendl;

    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (client_inst.name.is_mds() || !session) {
    reply->put();  // mds doesn't need a reply
    reply = 0;
  } else {
    // send reply.
    if (!did_early_reply &&  // don't issue leases if we sent an earlier reply already
        (tracei || tracedn)) {
      if (is_replay) {
        if (tracei)
          mdcache->try_reconnect_cap(tracei, session);
      } else {
        // include metadata in reply
        set_trace_dist(session, reply, tracei, tracedn,
                       snapid, dentry_wanted,
                       mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    req->get_connection()->send_message(reply);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
           << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}


void Server::encode_empty_dirstat(bufferlist& bl)
{
  static DirStat empty;
  empty.encode(bl);
}

void Server::encode_infinite_lease(bufferlist& bl)
{
  LeaseStat e;
  e.seq = 0;
  e.mask = -1;
  e.duration_ms = -1;
  ::encode(e, bl);
  dout(20) << "encode_infinite_lease " << e << dendl;
}

void Server::encode_null_lease(bufferlist& bl)
{
  LeaseStat e;
  e.seq = 0;
  e.mask = 0;
  e.duration_ms = 0;
  ::encode(e, bl);
  dout(20) << "encode_null_lease " << e << dendl;
}


/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 */
void Server::set_trace_dist(Session *session, MClientReply *reply,
                            CInode *in, CDentry *dn,
                            snapid_t snapid,
                            int dentry_wanted,
                            MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted);  // not true for snapshot lookups

  // realm
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    dir->encode_dirstat(bl, whoami);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    ::encode(dn->get_name(), bl);
    if (snapid == CEPH_NOSNAP)
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else
      encode_null_lease(bl);
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}




/***
 * process a client request
 * This function DOES put the passed message before returning
 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if (session->is_closed() ||
               session->is_closing() ||
               session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      if (req->is_queued_for_replay())
        mds->queue_one_replay();
      req->put();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it? hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      // Don't send a traceless reply if the completed request created a
      // new inode. Treat the request as a lookup request instead.
      if (req->is_replay() ||
          ((created == inodeno_t() || !mds->is_clientreplay()) &&
           req->get_op() != CEPH_MDS_OP_OPEN &&
           req->get_op() != CEPH_MDS_OP_CREATE)) {
        dout(5) << "already completed " << req->get_reqid() << dendl;
        MClientReply *reply = new MClientReply(req, 0);
        if (created != inodeno_t()) {
          bufferlist extra;
          ::encode(created, extra);
          reply->set_extra_bl(extra);
        }
        req->get_connection()->send_message(reply);

        if (req->is_queued_for_replay())
          mds->queue_one_replay();

        req->put();
        return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
          req->get_op() != CEPH_MDS_OP_CREATE) {
        dout(10) << " completed request which created new inode " << created
                 << ", convert it to lookup request" << dendl;
        req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
        req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // The session's 'completed_requests' was dirtied; mark it to be
      // potentially flushed at segment expiry.
1660 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
1661
1662 if (session->get_num_trim_requests_warnings() > 0 &&
1663 session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
1664 session->reset_num_trim_requests_warnings();
1665 } else {
1666 if (session->get_num_completed_requests() >=
1667 (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
1668 session->inc_num_trim_requests_warnings();
1669 stringstream ss;
1670 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
1671 << req->get_oldest_client_tid() << "), "
1672 << session->get_num_completed_requests()
1673 << " completed requests recorded in session\n";
1674 mds->clog->warn() << ss.str();
1675 dout(20) << __func__ << " " << ss.str() << dendl;
1676 }
1677 }
1678 }
1679
1680 // register + dispatch
1681 MDRequestRef mdr = mdcache->request_start(req);
1682 if (!mdr.get())
1683 return;
1684
1685 if (session) {
1686 mdr->session = session;
1687 session->requests.push_back(&mdr->item_session_request);
1688 }
1689
1690 if (has_completed)
1691 mdr->has_completed = true;
1692
1693 // process embedded cap releases?
1694 // (only if NOT replay!)
1695 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
1696 client_t client = req->get_source().num();
1697 for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
1698 p != req->releases.end();
1699 ++p)
1700 mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
1701 req->releases.clear();
1702 }
1703
1704 dispatch_client_request(mdr);
1705 return;
1706 }
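// A condensed sketch of the client-side bookkeeping that the trimming above
// relies on (illustrative only; the real logic lives in the client, and
// ClientTidTracker below is a hypothetical name, not Ceph code). Each request
// carries the smallest tid the client still considers unsafe, which is what
// lets the MDS prune its per-session completed_requests list:
//
//   struct ClientTidTracker {
//     std::set<ceph_tid_t> unsafe_tids;   // replies not yet "safe"
//     ceph_tid_t oldest_client_tid() const {
//       return unsafe_tids.empty() ? 0 : *unsafe_tids.begin();
//     }
//   };
//
// A client that never advances this value trips the warning path above once
// the session accumulates mds_max_completed_requests completed entries.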
1707
1708 void Server::handle_osd_map()
1709 {
1710 /* Note that we check the OSDMAP_FULL flag directly rather than
1711 * using osdmap_full_flag(), because we want to know "is the flag set"
1712 * rather than "does the flag apply to us?" */
1713 mds->objecter->with_osdmap([this](const OSDMap& o) {
1714 is_full = o.test_flag(CEPH_OSDMAP_FULL);
1715 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1716 << o.get_epoch() << dendl;
1717 });
1718 }
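// Objecter::with_osdmap() runs the lambda with the Objecter's map lock held
// and forwards the lambda's return value, so references to the OSDMap must not
// escape the closure. A minimal sketch of the same pattern:
//
//   epoch_t e = mds->objecter->with_osdmap(
//     [](const OSDMap& o) { return o.get_epoch(); });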
1719
1720 void Server::dispatch_client_request(MDRequestRef& mdr)
1721 {
1722 // we shouldn't be waiting on anyone.
1723 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1724
1725 if (mdr->killed) {
1726 dout(10) << "request " << *mdr << " was killed" << dendl;
1727 return;
1728 }
1729
1730 MClientRequest *req = mdr->client_request;
1731
1732 if (logger) logger->inc(l_mdss_dispatch_client_request);
1733
1734 dout(7) << "dispatch_client_request " << *req << dendl;
1735
1736 if (req->may_write()) {
1737 if (mdcache->is_readonly()) {
1738 dout(10) << " read-only FS" << dendl;
1739 respond_to_request(mdr, -EROFS);
1740 return;
1741 }
1742 if (mdr->has_more() && mdr->more()->slave_error) {
1743 dout(10) << " got error from slaves" << dendl;
1744 respond_to_request(mdr, mdr->more()->slave_error);
1745 return;
1746 }
1747 }
1748
1749 if (is_full) {
1750 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1751 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1753 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1754 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1755 req->get_op() == CEPH_MDS_OP_CREATE ||
1756 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1757 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1758 ((req->get_op() == CEPH_MDS_OP_LINK ||
1759 req->get_op() == CEPH_MDS_OP_RENAME) &&
1760 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1761 ) {
1762
1763 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1764 respond_to_request(mdr, -ENOSPC);
1765 return;
1766 } else {
1767 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1768 }
1769 }
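// Why LINK/RENAME are exempted once a witness exists (a note on the check
// above, not extra behaviour): aborting a multi-MDS link/rename after a slave
// has already prepared would leave the peers' state dangling, so the ENOSPC
// rejection only applies before any slave witness has been recruited.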
1770
1771 switch (req->get_op()) {
1772 case CEPH_MDS_OP_LOOKUPHASH:
1773 case CEPH_MDS_OP_LOOKUPINO:
1774 handle_client_lookup_ino(mdr, false, false);
1775 break;
1776 case CEPH_MDS_OP_LOOKUPPARENT:
1777 handle_client_lookup_ino(mdr, true, false);
1778 break;
1779 case CEPH_MDS_OP_LOOKUPNAME:
1780 handle_client_lookup_ino(mdr, false, true);
1781 break;
1782
1783 // inodes ops.
1784 case CEPH_MDS_OP_LOOKUP:
1785 handle_client_getattr(mdr, true);
1786 break;
1787
1788 case CEPH_MDS_OP_LOOKUPSNAP:
1789 // lookupsnap does not reference a CDentry; treat it as a getattr
1790 case CEPH_MDS_OP_GETATTR:
1791 handle_client_getattr(mdr, false);
1792 break;
1793
1794 case CEPH_MDS_OP_SETATTR:
1795 handle_client_setattr(mdr);
1796 break;
1797 case CEPH_MDS_OP_SETLAYOUT:
1798 handle_client_setlayout(mdr);
1799 break;
1800 case CEPH_MDS_OP_SETDIRLAYOUT:
1801 handle_client_setdirlayout(mdr);
1802 break;
1803 case CEPH_MDS_OP_SETXATTR:
1804 handle_client_setxattr(mdr);
1805 break;
1806 case CEPH_MDS_OP_RMXATTR:
1807 handle_client_removexattr(mdr);
1808 break;
1809
1810 case CEPH_MDS_OP_READDIR:
1811 handle_client_readdir(mdr);
1812 break;
1813
1814 case CEPH_MDS_OP_SETFILELOCK:
1815 handle_client_file_setlock(mdr);
1816 break;
1817
1818 case CEPH_MDS_OP_GETFILELOCK:
1819 handle_client_file_readlock(mdr);
1820 break;
1821
1822 // funky.
1823 case CEPH_MDS_OP_CREATE:
1824 if (mdr->has_completed)
1825 handle_client_open(mdr); // already created.. just open
1826 else
1827 handle_client_openc(mdr);
1828 break;
1829
1830 case CEPH_MDS_OP_OPEN:
1831 handle_client_open(mdr);
1832 break;
1833
1834 // namespace.
1835 // no prior locks.
1836 case CEPH_MDS_OP_MKNOD:
1837 handle_client_mknod(mdr);
1838 break;
1839 case CEPH_MDS_OP_LINK:
1840 handle_client_link(mdr);
1841 break;
1842 case CEPH_MDS_OP_UNLINK:
1843 case CEPH_MDS_OP_RMDIR:
1844 handle_client_unlink(mdr);
1845 break;
1846 case CEPH_MDS_OP_RENAME:
1847 handle_client_rename(mdr);
1848 break;
1849 case CEPH_MDS_OP_MKDIR:
1850 handle_client_mkdir(mdr);
1851 break;
1852 case CEPH_MDS_OP_SYMLINK:
1853 handle_client_symlink(mdr);
1854 break;
1855
1856
1857 // snaps
1858 case CEPH_MDS_OP_LSSNAP:
1859 handle_client_lssnap(mdr);
1860 break;
1861 case CEPH_MDS_OP_MKSNAP:
1862 handle_client_mksnap(mdr);
1863 break;
1864 case CEPH_MDS_OP_RMSNAP:
1865 handle_client_rmsnap(mdr);
1866 break;
1867 case CEPH_MDS_OP_RENAMESNAP:
1868 handle_client_renamesnap(mdr);
1869 break;
1870
1871 default:
1872 dout(1) << " unknown client op " << req->get_op() << dendl;
1873 respond_to_request(mdr, -EOPNOTSUPP);
1874 }
1875 }
1876
1877
1878 // ---------------------------------------
1879 // SLAVE REQUESTS
1880
1881 /* This function DOES put the passed message before returning. */
1882 void Server::handle_slave_request(MMDSSlaveRequest *m)
1883 {
1884 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1885 mds_rank_t from = mds_rank_t(m->get_source().num());
1886
1887 if (logger) logger->inc(l_mdss_handle_slave_request);
1888
1889 // reply?
1890 if (m->is_reply())
1891 return handle_slave_request_reply(m);
1892
1893 // The purpose of rename notify is to enforce causal message ordering: it makes
1894 // sure bystanders have received all messages from the rename srcdn's auth MDS.
1895 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1896 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1897 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1898 mds->send_message(reply, m->get_connection());
1899 m->put();
1900 return;
1901 }
1902
1903 CDentry *straydn = NULL;
1904 if (m->stray.length() > 0) {
1905 straydn = mdcache->add_replica_stray(m->stray, from);
1906 assert(straydn);
1907 m->stray.clear();
1908 }
1909
1910 // am i a new slave?
1911 MDRequestRef mdr;
1912 if (mdcache->have_request(m->get_reqid())) {
1913 // existing?
1914 mdr = mdcache->request_get(m->get_reqid());
1915
1916 // is my request newer?
1917 if (mdr->attempt > m->get_attempt()) {
1918 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
1919 << ", dropping " << *m << dendl;
1920 m->put();
1921 return;
1922 }
1923
1924
1925 if (mdr->attempt < m->get_attempt()) {
1926 // mine is old, close it out
1927 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
1928 << ", closing out" << dendl;
1929 mdcache->request_finish(mdr);
1930 mdr.reset();
1931 } else if (mdr->slave_to_mds != from) {
1932 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
1933 m->put();
1934 return;
1935 }
1936
1937 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
1938 mdr->aborted = true;
1939 if (mdr->slave_request) {
1940 // only abort on-going xlock, wrlock and auth pin
1941 assert(!mdr->slave_did_prepare());
1942 } else {
1943 mdcache->request_finish(mdr);
1944 }
1945 m->put(); return;  // this handler must put the message (see the contract above)
1946 }
1947 }
1948 if (!mdr.get()) {
1949 // new?
1950 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
1951 dout(10) << "missing slave request for " << m->get_reqid()
1952 << " OP_FINISH, must have lost race with a forward" << dendl;
1953 m->put();
1954 return;
1955 }
1956 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
1957 mdr->set_op_stamp(m->op_stamp);
1958 }
1959 assert(mdr->slave_request == 0); // only one at a time, please!
1960
1961 if (straydn) {
1962 mdr->pin(straydn);
1963 mdr->straydn = straydn;
1964 }
1965
1966 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1967 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1968 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1969 return;
1970 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
1971 mdr->locks.empty()) {
1972 dout(3) << "not active yet, waiting" << dendl;
1973 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
1974 return;
1975 }
1976
1977 mdr->slave_request = m;
1978
1979 dispatch_slave_request(mdr);
1980 }
1981
1982 /* This function DOES put the passed message before returning. */
1983 void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
1984 {
1985 mds_rank_t from = mds_rank_t(m->get_source().num());
1986
1987 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1988 metareqid_t r = m->get_reqid();
1989 if (!mdcache->have_uncommitted_master(r, from)) {
1990 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
1991 << from << " reqid " << r << dendl;
1992 m->put();
1993 return;
1994 }
1995 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1996 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1997 return;
1998 }
1999
2000 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2001 metareqid_t r = m->get_reqid();
2002 mdcache->committed_master_slave(r, from);
2003 m->put();
2004 return;
2005 }
2006
2007 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2008 if (m->get_attempt() != mdr->attempt) {
2009 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2010 << m->get_attempt() << dendl;
2011 m->put();
2012 return;
2013 }
2014
2015 switch (m->get_op()) {
2016 case MMDSSlaveRequest::OP_XLOCKACK:
2017 {
2018 // identify lock, master request
2019 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2020 m->get_object_info());
2021 mdr->more()->slaves.insert(from);
2022 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2023 mdr->xlocks.insert(lock);
2024 mdr->locks.insert(lock);
2025 mdr->finish_locking(lock);
2026 lock->get_xlock(mdr, mdr->get_client());
2027
2028 assert(mdr->more()->waiting_on_slave.count(from));
2029 mdr->more()->waiting_on_slave.erase(from);
2030 assert(mdr->more()->waiting_on_slave.empty());
2031 mdcache->dispatch_request(mdr);
2032 }
2033 break;
2034
2035 case MMDSSlaveRequest::OP_WRLOCKACK:
2036 {
2037 // identify lock, master request
2038 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2039 m->get_object_info());
2040 mdr->more()->slaves.insert(from);
2041 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2042 mdr->remote_wrlocks[lock] = from;
2043 mdr->locks.insert(lock);
2044 mdr->finish_locking(lock);
2045
2046 assert(mdr->more()->waiting_on_slave.count(from));
2047 mdr->more()->waiting_on_slave.erase(from);
2048 assert(mdr->more()->waiting_on_slave.empty());
2049 mdcache->dispatch_request(mdr);
2050 }
2051 break;
2052
2053 case MMDSSlaveRequest::OP_AUTHPINACK:
2054 handle_slave_auth_pin_ack(mdr, m);
2055 break;
2056
2057 case MMDSSlaveRequest::OP_LINKPREPACK:
2058 handle_slave_link_prep_ack(mdr, m);
2059 break;
2060
2061 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2062 handle_slave_rmdir_prep_ack(mdr, m);
2063 break;
2064
2065 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2066 handle_slave_rename_prep_ack(mdr, m);
2067 break;
2068
2069 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2070 handle_slave_rename_notify_ack(mdr, m);
2071 break;
2072
2073 default:
2074 ceph_abort();
2075 }
2076
2077 // done with reply.
2078 m->put();
2079 }
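// Message flow for a remote xlock, as implied by the OP_XLOCKACK handling
// above and dispatch_slave_request() below (a sketch; arrows are messages):
//
//   master                                      slave (lock auth)
//     | -- MMDSSlaveRequest(OP_XLOCK) ----------> |
//     |                                           |  acquire_locks(.., xlocks)
//     | <--------- MMDSSlaveRequest(OP_XLOCKACK)  |
//     |  record lock in mdr->xlocks/locks,        |
//     |  then mdcache->dispatch_request(mdr)      |
//
// OP_WRLOCK/OP_WRLOCKACK follow the same shape, recording the lock in
// mdr->remote_wrlocks instead.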
2080
2081 /* This function DOES put the mdr->slave_request before returning. */
2082 void Server::dispatch_slave_request(MDRequestRef& mdr)
2083 {
2084 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2085
2086 if (mdr->aborted) {
2087 dout(7) << " abort flag set, finishing" << dendl;
2088 mdcache->request_finish(mdr);
2089 return;
2090 }
2091
2092 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2093
2094 int op = mdr->slave_request->get_op();
2095 switch (op) {
2096 case MMDSSlaveRequest::OP_XLOCK:
2097 case MMDSSlaveRequest::OP_WRLOCK:
2098 {
2099 // identify object
2100 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2101 mdr->slave_request->get_object_info());
2102
2103 if (!lock) {
2104 dout(10) << "don't have object, dropping" << dendl;
2105 ceph_abort(); // can this happen if we auth pinned properly?
2106 }
2107 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2108 dout(10) << "not auth for remote xlock attempt, dropping on "
2109 << *lock << " on " << *lock->get_parent() << dendl;
2110 } else {
2111 // use acquire_locks so that we get auth_pinning.
2112 set<SimpleLock*> rdlocks;
2113 set<SimpleLock*> wrlocks = mdr->wrlocks;
2114 set<SimpleLock*> xlocks = mdr->xlocks;
2115
2116 int replycode = 0;
2117 switch (op) {
2118 case MMDSSlaveRequest::OP_XLOCK:
2119 xlocks.insert(lock);
2120 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2121 break;
2122 case MMDSSlaveRequest::OP_WRLOCK:
2123 wrlocks.insert(lock);
2124 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2125 break;
2126 }
2127
2128 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
2129 return;
2130
2131 // ack
2132 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
2133 r->set_lock_type(lock->get_type());
2134 lock->get_parent()->set_object_info(r->get_object_info());
2135 mds->send_message(r, mdr->slave_request->get_connection());
2136 }
2137
2138 // done.
2139 mdr->slave_request->put();
2140 mdr->slave_request = 0;
2141 }
2142 break;
2143
2144 case MMDSSlaveRequest::OP_UNXLOCK:
2145 case MMDSSlaveRequest::OP_UNWRLOCK:
2146 {
2147 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2148 mdr->slave_request->get_object_info());
2149 assert(lock);
2150 bool need_issue = false;
2151 switch (op) {
2152 case MMDSSlaveRequest::OP_UNXLOCK:
2153 mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
2154 break;
2155 case MMDSSlaveRequest::OP_UNWRLOCK:
2156 mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
2157 break;
2158 }
2159 if (need_issue)
2160 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2161
2162 // done. no ack necessary.
2163 mdr->slave_request->put();
2164 mdr->slave_request = 0;
2165 }
2166 break;
2167
2168 case MMDSSlaveRequest::OP_DROPLOCKS:
2169 mds->locker->drop_locks(mdr.get());
2170 mdr->slave_request->put();
2171 mdr->slave_request = 0;
2172 break;
2173
2174 case MMDSSlaveRequest::OP_AUTHPIN:
2175 handle_slave_auth_pin(mdr);
2176 break;
2177
2178 case MMDSSlaveRequest::OP_LINKPREP:
2179 case MMDSSlaveRequest::OP_UNLINKPREP:
2180 handle_slave_link_prep(mdr);
2181 break;
2182
2183 case MMDSSlaveRequest::OP_RMDIRPREP:
2184 handle_slave_rmdir_prep(mdr);
2185 break;
2186
2187 case MMDSSlaveRequest::OP_RENAMEPREP:
2188 handle_slave_rename_prep(mdr);
2189 break;
2190
2191 case MMDSSlaveRequest::OP_FINISH:
2192 // information about rename imported caps
2193 if (mdr->slave_request->inode_export.length() > 0)
2194 mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
2195 // finish off request.
2196 mdcache->request_finish(mdr);
2197 break;
2198
2199 default:
2200 ceph_abort();
2201 }
2202 }
2203
2204 /* This function DOES put the mdr->slave_request before returning. */
2205 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2206 {
2207 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2208
2209 // build list of objects
2210 list<MDSCacheObject*> objects;
2211 CInode *auth_pin_freeze = NULL;
2212 bool fail = false, wouldblock = false, readonly = false;
2213
2214 if (mdcache->is_readonly()) {
2215 dout(10) << " read-only FS" << dendl;
2216 readonly = true;
2217 fail = true;
2218 }
2219
2220 if (!fail) {
2221 for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
2222 p != mdr->slave_request->get_authpins().end();
2223 ++p) {
2224 MDSCacheObject *object = mdcache->get_object(*p);
2225 if (!object) {
2226 dout(10) << " don't have " << *p << dendl;
2227 fail = true;
2228 break;
2229 }
2230
2231 objects.push_back(object);
2232 if (*p == mdr->slave_request->get_authpin_freeze())
2233 auth_pin_freeze = static_cast<CInode*>(object);
2234 }
2235 }
2236
2237 // can we auth pin them?
2238 if (!fail) {
2239 for (list<MDSCacheObject*>::iterator p = objects.begin();
2240 p != objects.end();
2241 ++p) {
2242 if (!(*p)->is_auth()) {
2243 dout(10) << " not auth for " << **p << dendl;
2244 fail = true;
2245 break;
2246 }
2247 if (mdr->is_auth_pinned(*p))
2248 continue;
2249 if (!mdr->can_auth_pin(*p)) {
2250 if (mdr->slave_request->is_nonblock()) {
2251 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2252 fail = true;
2253 wouldblock = true;
2254 break;
2255 }
2256 // wait
2257 dout(10) << " waiting for authpinnable on " << **p << dendl;
2258 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2259 mdr->drop_local_auth_pins();
2260
2261 mds->locker->notify_freeze_waiter(*p);
2262 return;
2263 }
2264 }
2265 }
2266
2267 // auth pin!
2268 if (fail) {
2269 mdr->drop_local_auth_pins(); // just in case
2270 } else {
2271 /* the frozen auth pin is on the wrong inode; unfreeze it */
2272 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2273 mdr->more()->rename_inode != auth_pin_freeze)
2274 mdr->unfreeze_auth_pin(true);
2275
2276 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2277 * on the source inode to complete. This happens after all locks for the rename
2278 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2279 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2280 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2281 * is called. The solution is to freeze the inode and prevent other MDRequests
2282 * from getting new auth pins.
2283 */
2284 if (auth_pin_freeze) {
2285 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2286 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2287 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2288 mds->mdlog->flush();
2289 return;
2290 }
2291 }
2292 for (list<MDSCacheObject*>::iterator p = objects.begin();
2293 p != objects.end();
2294 ++p) {
2295 dout(10) << "auth_pinning " << **p << dendl;
2296 mdr->auth_pin(*p);
2297 }
2298 }
2299
2300 // ack!
2301 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2302
2303 // return list of my auth_pins (if any)
2304 for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
2305 p != mdr->auth_pins.end();
2306 ++p) {
2307 MDSCacheObjectInfo info;
2308 (*p)->set_object_info(info);
2309 reply->get_authpins().push_back(info);
2310 if (*p == (MDSCacheObject*)auth_pin_freeze)
2311 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2312 }
2313
2314 if (wouldblock)
2315 reply->mark_error_wouldblock();
2316 if (readonly)
2317 reply->mark_error_rofs();
2318
2319 mds->send_message_mds(reply, mdr->slave_to_mds);
2320
2321 // clean up this request
2322 mdr->slave_request->put();
2323 mdr->slave_request = 0;
2324 return;
2325 }
2326
2327 /* This function DOES NOT put the passed ack before returning. */
2328 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
2329 {
2330 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2331 mds_rank_t from = mds_rank_t(ack->get_source().num());
2332
2333 // added auth pins?
2334 set<MDSCacheObject*> pinned;
2335 for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
2336 p != ack->get_authpins().end();
2337 ++p) {
2338 MDSCacheObject *object = mdcache->get_object(*p);
2339 assert(object); // we pinned it
2340 dout(10) << " remote has pinned " << *object << dendl;
2341 if (!mdr->is_auth_pinned(object))
2342 mdr->remote_auth_pins[object] = from;
2343 if (*p == ack->get_authpin_freeze())
2344 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2345 pinned.insert(object);
2346 }
2347
2348 // removed frozen auth pin?
2349 if (mdr->more()->is_remote_frozen_authpin &&
2350 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2351 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2352 assert(p != mdr->remote_auth_pins.end());
2353 if (p->second == from) {
2354 mdr->more()->is_remote_frozen_authpin = false;
2355 }
2356 }
2357
2358 // removed auth pins?
2359 map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
2360 while (p != mdr->remote_auth_pins.end()) {
2361 MDSCacheObject* object = p->first;
2362 if (p->second == from && pinned.count(object) == 0) {
2363 dout(10) << " remote has unpinned " << *object << dendl;
2364 mdr->remote_auth_pins.erase(p++);
2365 } else {
2366 ++p;
2367 }
2368 }
2369
2370 if (ack->is_error_rofs()) {
2371 mdr->more()->slave_error = -EROFS;
2372 mdr->aborted = true;
2373 } else if (ack->is_error_wouldblock()) {
2374 mdr->more()->slave_error = -EWOULDBLOCK;
2375 mdr->aborted = true;
2376 }
2377
2378 // note slave
2379 mdr->more()->slaves.insert(from);
2380
2381 // clear from waiting list
2382 assert(mdr->more()->waiting_on_slave.count(from));
2383 mdr->more()->waiting_on_slave.erase(from);
2384
2385 // go again?
2386 if (mdr->more()->waiting_on_slave.empty())
2387 mdcache->dispatch_request(mdr);
2388 else
2389 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2390 }
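// How the error bits from handle_slave_auth_pin() surface on the master
// (condensed from the code above, no new behaviour):
//
//   reply->mark_error_rofs()       -> mdr->more()->slave_error = -EROFS
//   reply->mark_error_wouldblock() -> mdr->more()->slave_error = -EWOULDBLOCK
//
// Both also set mdr->aborted, so a subsequent dispatch_client_request() pass
// fails the write request with slave_error instead of retrying the pin.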
2391
2392
2393 // ---------------------------------------
2394 // HELPERS
2395
2396
2397 /**
2398 * check whether we are permitted to complete a request
2399 *
2400 * Check whether we have permission to perform the operation specified
2401 * by mask on the given inode, based on the capability in the mdr's
2402 * session.
2403 */
2404 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2405 {
2406 if (mdr->session) {
2407 int r = mdr->session->check_access(
2408 in, mask,
2409 mdr->client_request->get_caller_uid(),
2410 mdr->client_request->get_caller_gid(),
2411 &mdr->client_request->get_caller_gid_list(),
2412 mdr->client_request->head.args.setattr.uid,
2413 mdr->client_request->head.args.setattr.gid);
2414 if (r < 0) {
2415 respond_to_request(mdr, r);
2416 return false;
2417 }
2418 }
2419 return true;
2420 }
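// Typical caller pattern (used by the handlers throughout this file): the
// helper already replies on failure, so callers simply bail out, e.g.
//
//   if (!check_access(mdr, cur, MAY_READ))
//     return;   // respond_to_request() was called inside check_access()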
2421
2422 /**
2423 * check whether fragment has reached maximum size
2424 *
2425 */
2426 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2427 {
2428 const auto size = in->get_frag_size();
2429 if (size >= g_conf->mds_bal_fragment_size_max) {
2430 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2431 respond_to_request(mdr, -ENOSPC);
2432 return false;
2433 }
2434
2435 return true;
2436 }
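// The limit is the mds_bal_fragment_size_max option (100000 by default in
// this release). It can be adjusted at runtime, e.g. (exact CLI may vary by
// version):
//
//   ceph tell mds.* injectargs '--mds_bal_fragment_size_max 200000'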
2437
2438
2439 /** validate_dentry_dir
2440 *
2441 * verify that the dir exists and would own the dname.
2442 * do not check if the dentry exists.
2443 */
2444 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
2445 {
2446 // make sure parent is a dir?
2447 if (!diri->is_dir()) {
2448 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2449 respond_to_request(mdr, -ENOTDIR);
2450 return NULL;
2451 }
2452
2453 // which dirfrag?
2454 frag_t fg = diri->pick_dirfrag(dname);
2455 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2456 if (!dir)
2457 return 0;
2458
2459 // frozen?
2460 if (dir->is_frozen()) {
2461 dout(7) << "dir is frozen " << *dir << dendl;
2462 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2463 return NULL;
2464 }
2465
2466 return dir;
2467 }
2468
2469
2470 /** prepare_null_dentry
2471 * prepare a null (or existing) dentry in given dir.
2472 * wait for any dn lock.
2473 */
2474 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
2475 {
2476 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2477 assert(dir->is_auth());
2478
2479 client_t client = mdr->get_client();
2480
2481 // does it already exist?
2482 CDentry *dn = dir->lookup(dname);
2483 if (dn) {
2484 /*
2485 if (dn->lock.is_xlocked_by_other(mdr)) {
2486 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2487 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2488 return 0;
2489 }
2490 */
2491 if (!dn->get_linkage(client, mdr)->is_null()) {
2492 // name already exists
2493 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
2494 if (!okexist) {
2495 respond_to_request(mdr, -EEXIST);
2496 return 0;
2497 }
2498 } else {
2499 dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
2500 }
2501
2502 return dn;
2503 }
2504
2505 // make sure dir is complete
2506 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2507 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2508 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2509 return 0;
2510 }
2511
2512 // create
2513 dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
2514 dn->mark_new();
2515 dout(10) << "prepare_null_dentry added " << *dn << dendl;
2516 return dn;
2517 }
2518
2519 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2520 {
2521 CDentry *straydn = mdr->straydn;
2522 if (straydn) {
2523 string straydname;
2524 in->name_stray_dentry(straydname);
2525 if (straydn->get_name() == straydname)
2526 return straydn;
2527
2528 assert(!mdr->done_locking);
2529 mdr->unpin(straydn);
2530 }
2531
2532 CDir *straydir = mdcache->get_stray_dir(in);
2533
2534 if (!mdr->client_request->is_replay() &&
2535 !check_fragment_space(mdr, straydir))
2536 return NULL;
2537
2538 straydn = mdcache->get_or_create_stray_dentry(in);
2539 mdr->straydn = straydn;
2540 mdr->pin(straydn);
2541 return straydn;
2542 }
2543
2544 /** prepare_new_inode
2545 *
2546 * create a new inode. set c/m/atime. hit dir pop.
2547 */
2548 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
2549 file_layout_t *layout)
2550 {
2551 CInode *in = new CInode(mdcache);
2552
2553 // Server::prepare_force_open_sessions() can re-open a session in the closing
2554 // state, and in that corner case the session's prealloc_inos are being freed.
2555 // To simplify the code, we disallow using/refilling the session's prealloc_inos
2556 // while the session is opening.
2557 bool allow_prealloc_inos = !mdr->session->is_opening();
2558
2559 // assign ino
2560 if (allow_prealloc_inos &&
2561 mdr->session->info.prealloc_inos.size()) {
2562 mdr->used_prealloc_ino =
2563 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
2564 mds->sessionmap.mark_projected(mdr->session);
2565
2566 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
2567 << " (" << mdr->session->info.prealloc_inos
2568 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
2569 << dendl;
2570 } else {
2571 mdr->alloc_ino =
2572 in->inode.ino = mds->inotable->project_alloc_id();
2573 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
2574 }
2575
2576 if (useino && useino != in->inode.ino) {
2577 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
2578 mds->clog->error() << mdr->client_request->get_source()
2579 << " specified ino " << useino
2580 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
2581 //ceph_abort(); // just for now.
2582 }
2583
2584 if (allow_prealloc_inos &&
2585 mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
2586 int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
2587 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
2588 assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
2589 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
2590 mds->sessionmap.mark_projected(mdr->session);
2591 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
2592 }
2593
2594 in->inode.version = 1;
2595 in->inode.xattr_version = 1;
2596 in->inode.nlink = 1; // FIXME
2597
2598 in->inode.mode = mode;
2599
2600 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
2601 if (in->inode.is_dir()) {
2602 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
2603 } else if (layout) {
2604 in->inode.layout = *layout;
2605 } else {
2606 in->inode.layout = mdcache->default_file_layout;
2607 }
2608
2609 in->inode.truncate_size = -1ull; // not truncated, yet!
2610 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
2611
2612 CInode *diri = dir->get_inode();
2613
2614 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
2615
2616 if (diri->inode.mode & S_ISGID) {
2617 dout(10) << " dir is sticky" << dendl;
2618 in->inode.gid = diri->inode.gid;
2619 if (S_ISDIR(mode)) {
2620 dout(10) << " new dir also sticky" << dendl;
2621 in->inode.mode |= S_ISGID;
2622 }
2623 } else
2624 in->inode.gid = mdr->client_request->get_caller_gid();
2625
2626 in->inode.uid = mdr->client_request->get_caller_uid();
2627
2628 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
2629 mdr->get_op_stamp();
2630
2631 in->inode.change_attr = 0;
2632
2633 MClientRequest *req = mdr->client_request;
2634 if (req->get_data().length()) {
2635 bufferlist::iterator p = req->get_data().begin();
2636
2637 // xattrs on new inode?
2638 map<string,bufferptr> xattrs;
2639 ::decode(xattrs, p);
2640 for (map<string,bufferptr>::iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
2641 dout(10) << "prepare_new_inode setting xattr " << p->first << dendl;
2642 in->xattrs[p->first] = p->second;
2643 }
2644 }
2645
2646 if (!mds->mdsmap->get_inline_data_enabled() ||
2647 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
2648 in->inode.inline_data.version = CEPH_INLINE_NONE;
2649
2650 mdcache->add_inode(in); // add
2651 dout(10) << "prepare_new_inode " << *in << dendl;
2652 return in;
2653 }
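// Condensed view of the ino-allocation policy implemented above (a summary,
// not additional logic):
//
//   if (session is not opening && session->info.prealloc_inos not empty)
//     ino = session->take_ino(useino);       // recorded as used_prealloc_ino
//   else
//     ino = inotable->project_alloc_id();    // recorded as alloc_ino
//   // and if the projected prealloc pool drops below half of
//   // mds_client_prealloc_inos, a refill is projected into mdr->prealloc_inos.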
2654
2655 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
2656 {
2657 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
2658 << " inotablev " << mds->inotable->get_projected_version()
2659 << dendl;
2660 blob->set_ino_alloc(mdr->alloc_ino,
2661 mdr->used_prealloc_ino,
2662 mdr->prealloc_inos,
2663 mdr->client_request->get_source(),
2664 mds->sessionmap.get_projected(),
2665 mds->inotable->get_projected_version());
2666 }
2667
2668 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2669 {
2670 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2671 << " / " << mdr->prealloc_inos
2672 << " / " << mdr->used_prealloc_ino << dendl;
2673
2674 if (mdr->alloc_ino) {
2675 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2676 }
2677 if (mdr->prealloc_inos.size()) {
2678 assert(session);
2679 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2680 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2681 mds->sessionmap.mark_dirty(session);
2682 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2683 }
2684 if (mdr->used_prealloc_ino) {
2685 assert(session);
2686 session->info.used_inos.erase(mdr->used_prealloc_ino);
2687 mds->sessionmap.mark_dirty(session);
2688 }
2689 }
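// journal_allocated_inos() records the projected allocations in the EMetaBlob
// while an update event is being built; apply_allocated_inos() commits them to
// the InoTable/SessionMap once the event is safely journaled. The two are used
// in lockstep, roughly (a sketch of the usual update path):
//
//   journal_allocated_inos(mdr, &le->metablob);  // before submit_entry()
//   ...journal commit...
//   apply_allocated_inos(mdr, session);          // in the commit callback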
2690
2691 class C_MDS_TryFindInode : public ServerContext {
2692 MDRequestRef mdr;
2693 public:
2694 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2695 void finish(int r) override {
2696 if (r == -ESTALE) // :( find_ino_peers failed
2697 server->respond_to_request(mdr, r);
2698 else
2699 server->dispatch_client_request(mdr);
2700 }
2701 };
2702
2703 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2704 {
2705 // figure parent dir vs dname
2706 if (refpath.depth() == 0) {
2707 dout(7) << "can't do that to root" << dendl;
2708 respond_to_request(mdr, -EINVAL);
2709 return 0;
2710 }
2711 string dname = refpath.last_dentry();
2712 refpath.pop_dentry();
2713
2714 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2715
2716 // traverse to parent dir
2717 CInode *diri;
2718 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2719 if (r > 0) return 0; // delayed
2720 if (r < 0) {
2721 if (r == -ESTALE) {
2722 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2723 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2724 return 0;
2725 }
2726 respond_to_request(mdr, r);
2727 return 0;
2728 }
2729
2730 // is it an auth dir?
2731 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2732 if (!dir)
2733 return 0; // forwarded or waiting for freeze
2734
2735 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2736 return dir;
2737 }
2738
2739 /* If this returns null, the request has been handled
2740 * as appropriate: forwarded on, or the client's been replied to */
2741 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
2742 set<SimpleLock*> &rdlocks,
2743 bool want_auth,
2744 bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
2745 a snapped dir */
2746 file_layout_t **layout,
2747 bool no_lookup) // true if we cannot return a null dentry lease
2748 {
2749 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2750 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
2751
2752 if (mdr->done_locking)
2753 return mdr->in[n];
2754
2755 // traverse
2756 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
2757 if (r > 0)
2758 return NULL; // delayed
2759 if (r < 0) { // error
2760 if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
2761 if (!no_lookup)
2762 mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
2763 respond_to_request(mdr, r);
2764 } else if (r == -ESTALE) {
2765 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2766 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
2767 mdcache->find_ino_peers(refpath.get_ino(), c);
2768 } else {
2769 dout(10) << "FAIL on error " << r << dendl;
2770 respond_to_request(mdr, r);
2771 }
2772 return 0;
2773 }
2774 CInode *ref = mdr->in[n];
2775 dout(10) << "ref is " << *ref << dendl;
2776
2777 // fw to inode auth?
2778 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
2779 want_auth = true;
2780
2781 if (want_auth) {
2782 if (ref->is_ambiguous_auth()) {
2783 dout(10) << "waiting for single auth on " << *ref << dendl;
2784 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
2785 return 0;
2786 }
2787 if (!ref->is_auth()) {
2788 dout(10) << "fw to auth for " << *ref << dendl;
2789 mdcache->request_forward(mdr, ref->authority().first);
2790 return 0;
2791 }
2792
2793 // auth_pin?
2794 // do NOT proceed if freezing, as cap release may defer in that case, and
2795 // we could deadlock when we try to lock @ref.
2796 // if we're already auth_pinned, continue; the release has already been processed.
2797 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
2798 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
2799 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
2800 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2801 /* If we have any auth pins, this will deadlock.
2802 * But the only way to get here if we've already got auth pins
2803 * is because we're on an inode with snapshots that got updated
2804 * between dispatches of this request. So we're going to drop
2805 * our locks and our auth pins and reacquire them later.
2806 *
2807 * This is safe since we're only in this function when working on
2808 * a single MDS request; otherwise we'd be in
2809 * rdlock_path_xlock_dentry.
2810 */
2811 mds->locker->drop_locks(mdr.get(), NULL);
2812 mdr->drop_local_auth_pins();
2813 if (!mdr->remote_auth_pins.empty())
2814 mds->locker->notify_freeze_waiter(ref);
2815 return 0;
2816 }
2817
2818 mdr->auth_pin(ref);
2819 }
2820
2821 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2822 rdlocks.insert(&mdr->dn[n][i]->lock);
2823 if (layout)
2824 mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
2825 else
2826 mds->locker->include_snap_rdlocks(rdlocks, ref);
2827
2828 // set and pin ref
2829 mdr->pin(ref);
2830 return ref;
2831 }
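// Callers treat a NULL return as "already handled": the request was forwarded,
// queued behind a waiter, or answered with an error. E.g. (as in
// handle_client_open() below):
//
//   CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
//   if (!cur)
//     return;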
2832
2833
2834 /** rdlock_path_xlock_dentry
2835 * traverse path to the directory that could/would contain dentry.
2836 * make sure i am auth for that dentry, forward as necessary.
2837 * create null dentry in place (or use existing if okexist).
2838 * get rdlocks on traversed dentries, xlock on new dentry.
2839 */
2840 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
2841 set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
2842 bool okexist, bool mustexist, bool alwaysxlock,
2843 file_layout_t **layout)
2844 {
2845 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2846
2847 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
2848
2849 client_t client = mdr->get_client();
2850
2851 if (mdr->done_locking)
2852 return mdr->dn[n].back();
2853
2854 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
2855 if (!dir) return 0;
2856
2857 CInode *diri = dir->get_inode();
2858 if (!mdr->reqid.name.is_mds()) {
2859 if (diri->is_system() && !diri->is_root()) {
2860 respond_to_request(mdr, -EROFS);
2861 return 0;
2862 }
2863 }
2864 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
2865 respond_to_request(mdr, -ENOENT);
2866 return 0;
2867 }
2868
2869 // make a null dentry?
2870 const string &dname = refpath.last_dentry();
2871 CDentry *dn;
2872 if (mustexist) {
2873 dn = dir->lookup(dname);
2874
2875 // make sure dir is complete
2876 if (!dn && !dir->is_complete() &&
2877 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2878 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2879 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2880 return 0;
2881 }
2882
2883 // readable?
2884 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
2885 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2886 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2887 return 0;
2888 }
2889
2890 // exists?
2891 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
2892 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
2893 respond_to_request(mdr, -ENOENT);
2894 return 0;
2895 }
2896 } else {
2897 dn = prepare_null_dentry(mdr, dir, dname, okexist);
2898 if (!dn)
2899 return 0;
2900 }
2901
2902 mdr->dn[n].push_back(dn);
2903 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
2904 mdr->in[n] = dnl->get_inode();
2905
2906 // -- lock --
2907 // NOTE: rename takes the same set of locks for srcdn
2908 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2909 rdlocks.insert(&mdr->dn[n][i]->lock);
2910 if (alwaysxlock || dnl->is_null())
2911 xlocks.insert(&dn->lock); // new dn, xlock
2912 else
2913 rdlocks.insert(&dn->lock); // existing dn, rdlock
2914 wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
2915 wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir mtime
2916 if (layout)
2917 mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
2918 else
2919 mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);
2920
2921 return dn;
2922 }
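// Lock set produced for a create-style path (a summary of the code above):
// rdlocks on every traversed dentry plus the snap locks; an xlock on the new
// (null) dentry, or an rdlock if it already exists; and wrlocks on the parent
// inode's filelock and nestlock so the directory's mtime and rstats can be
// projected.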
2923
2924
2925
2926
2927
2928 /**
2929 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2930 *
2931 * @param diri base inode
2932 * @param fg the exact frag we want
2933 * @param mdr request
2934 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2935 */
2936 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2937 {
2938 CDir *dir = diri->get_dirfrag(fg);
2939
2940 // not open and inode not mine?
2941 if (!dir && !diri->is_auth()) {
2942 mds_rank_t inauth = diri->authority().first;
2943 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2944 mdcache->request_forward(mdr, inauth);
2945 return 0;
2946 }
2947
2948 // not open and inode frozen?
2949 if (!dir && diri->is_frozen()) {
2950 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2951 assert(diri->get_parent_dir());
2952 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2953 return 0;
2954 }
2955
2956 // invent?
2957 if (!dir)
2958 dir = diri->get_or_open_dirfrag(mdcache, fg);
2959
2960 // am i auth for the dirfrag?
2961 if (!dir->is_auth()) {
2962 mds_rank_t auth = dir->authority().first;
2963 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2964 << ", fw to mds." << auth << dendl;
2965 mdcache->request_forward(mdr, auth);
2966 return 0;
2967 }
2968
2969 return dir;
2970 }
2971
2972
2973 // ===============================================================================
2974 // STAT
2975
2976 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
2977 {
2978 MClientRequest *req = mdr->client_request;
2979 set<SimpleLock*> rdlocks, wrlocks, xlocks;
2980
2981 if (req->get_filepath().depth() == 0 && is_lookup) {
2982 // refpath can't be empty for lookup but it can for
2983 // getattr (we do getattr with empty refpath for mount of '/')
2984 respond_to_request(mdr, -EINVAL);
2985 return;
2986 }
2987
2988 CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
2989 if (!ref) return;
2990
2991 /*
2992 * if client currently holds the EXCL cap on a field, do not rdlock
2993 * it; client's stat() will result in valid info if _either_ EXCL
2994 * cap is held or MDS rdlocks and reads the value here.
2995 *
2996 * handling this case here is easier than weakening rdlock
2997 * semantics... that would cause problems elsewhere.
2998 */
2999 client_t client = mdr->get_client();
3000 int issued = 0;
3001 Capability *cap = ref->get_client_cap(client);
3002 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3003 mdr->snapid <= cap->client_follows))
3004 issued = cap->issued();
3005
3006 int mask = req->head.args.getattr.mask;
3007 if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
3008 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
3009 if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
3010 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);
3011
3012 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3013 return;
3014
3015 if (!check_access(mdr, ref, MAY_READ))
3016 return;
3017
3018 // note which caps are requested, so we return at least a snapshot
3019 // value for them. (currently this matters for xattrs and inline data)
3020 mdr->getattr_caps = mask;
3021
3022 mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
3023 req->get_source().num());
3024
3025 // reply
3026 dout(10) << "reply to stat on " << *req << dendl;
3027 mdr->tracei = ref;
3028 if (is_lookup)
3029 mdr->tracedn = mdr->dn[0].back();
3030 respond_to_request(mdr, 0);
3031 }
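// The mask-to-rdlock mapping above, tabulated (same information, no new
// behaviour); a lock is skipped when the client already holds the
// corresponding EXCL cap:
//
//   CEPH_CAP_LINK_SHARED  -> linklock     (skip if LINK_EXCL issued)
//   CEPH_CAP_AUTH_SHARED  -> authlock     (skip if AUTH_EXCL issued)
//   CEPH_CAP_FILE_SHARED  -> filelock     (skip if FILE_EXCL issued)
//   CEPH_CAP_XATTR_SHARED -> xattrlock    (skip if XATTR_EXCL issued)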
3032
3033 struct C_MDS_LookupIno2 : public ServerContext {
3034 MDRequestRef mdr;
3035 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3036 void finish(int r) override {
3037 server->_lookup_ino_2(mdr, r);
3038 }
3039 };
3040
3041 /* This function DOES clean up the mdr before returning. */
3042 /*
3043 * filepath: ino
3044 */
3045 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3046 bool want_parent, bool want_dentry)
3047 {
3048 MClientRequest *req = mdr->client_request;
3049
3050 inodeno_t ino = req->get_filepath().get_ino();
3051 CInode *in = mdcache->get_inode(ino);
3052 if (in && in->state_test(CInode::STATE_PURGING)) {
3053 respond_to_request(mdr, -ESTALE);
3054 return;
3055 }
3056 if (!in) {
3057 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3058 return;
3059 }
3060
3061 if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
3062 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3063 return;
3064 }
3065
3066 // check for nothing (not read or write); this still applies the
3067 // path check.
3068 if (!check_access(mdr, in, 0))
3069 return;
3070
3071 CDentry *dn = in->get_projected_parent_dn();
3072 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3073
3074 set<SimpleLock*> rdlocks;
3075 if (dn && (want_parent || want_dentry)) {
3076 mdr->pin(dn);
3077 rdlocks.insert(&dn->lock);
3078 }
3079
3080 unsigned mask = req->head.args.getattr.mask;
3081 if (mask) {
3082 Capability *cap = in->get_client_cap(mdr->get_client());
3083 int issued = 0;
3084 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3085 issued = cap->issued();
3086 // permission bits, ACL/security xattrs
3087 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3088 rdlocks.insert(&in->authlock);
3089 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3090 rdlocks.insert(&in->xattrlock);
3091
3092 mdr->getattr_caps = mask;
3093 }
3094
3095 if (!rdlocks.empty()) {
3096 set<SimpleLock*> wrlocks, xlocks;
3097 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3098 return;
3099
3100 if (diri != NULL) {
3101 // need read access to directory inode
3102 if (!check_access(mdr, diri, MAY_READ))
3103 return;
3104 }
3105 }
3106
3107 if (want_parent) {
3108 if (in->is_base()) {
3109 respond_to_request(mdr, -EINVAL);
3110 return;
3111 }
3112 if (!diri || diri->is_stray()) {
3113 respond_to_request(mdr, -ESTALE);
3114 return;
3115 }
3116 dout(10) << "reply to lookup_parent " << *in << dendl;
3117 mdr->tracei = diri;
3118 respond_to_request(mdr, 0);
3119 } else {
3120 if (want_dentry) {
3121 inodeno_t dirino = req->get_filepath2().get_ino();
3122 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3123 respond_to_request(mdr, -ENOENT);
3124 return;
3125 }
3126 dout(10) << "reply to lookup_name " << *in << dendl;
3127 } else
3128 dout(10) << "reply to lookup_ino " << *in << dendl;
3129
3130 mdr->tracei = in;
3131 if (want_dentry)
3132 mdr->tracedn = dn;
3133 respond_to_request(mdr, 0);
3134 }
3135 }
3136
3137 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3138 {
3139 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3140 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3141
3142 // `r` is a rank if >=0, else an error code
3143 if (r >= 0) {
3144 mds_rank_t dest_rank(r);
3145 if (dest_rank == mds->get_nodeid())
3146 dispatch_client_request(mdr);
3147 else
3148 mdcache->request_forward(mdr, dest_rank);
3149 return;
3150 }
3151
3152 // give up
3153 if (r == -ENOENT || r == -ENODATA)
3154 r = -ESTALE;
3155 respond_to_request(mdr, r);
3156 }
3157
3158
3159 /* This function takes responsibility for the passed mdr. */
3160 void Server::handle_client_open(MDRequestRef& mdr)
3161 {
3162 MClientRequest *req = mdr->client_request;
3163 dout(7) << "open on " << req->get_filepath() << dendl;
3164
3165 int flags = req->head.args.open.flags;
3166 int cmode = ceph_flags_to_mode(flags);
3167 if (cmode < 0) {
3168 respond_to_request(mdr, -EINVAL);
3169 return;
3170 }
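// ceph_flags_to_mode() (shared with the client via include/ceph_fs.h) reduces
// the open flags to a cap-oriented file mode; representative mappings,
// summarized here for orientation:
//
//   CEPH_O_RDONLY            -> CEPH_FILE_MODE_RD
//   CEPH_O_WRONLY            -> CEPH_FILE_MODE_WR
//   CEPH_O_RDWR              -> CEPH_FILE_MODE_RDWR (RD|WR)
//   unsupported combination  -> negative, rejected with -EINVAL above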
3171
3172 bool need_auth = !file_mode_is_readonly(cmode) ||
3173 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3174
3175 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3176 dout(7) << "read-only FS" << dendl;
3177 respond_to_request(mdr, -EROFS);
3178 return;
3179 }
3180
3181 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3182 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3183 if (!cur)
3184 return;
3185
3186 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3187 assert(!need_auth);
3188 mdr->done_locking = false;
3189 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3190 if (!cur)
3191 return;
3192 }
3193
3194 if (!cur->inode.is_file()) {
3195 // can only open a non-regular inode with mode FILE_MODE_PIN, at least for now.
3196 cmode = CEPH_FILE_MODE_PIN;
3197 // the inode is a symlink and the client wants to follow it; ignore the O_TRUNC flag.
3198 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3199 flags &= ~CEPH_O_TRUNC;
3200 }
3201
3202 dout(10) << "open flags = " << flags
3203 << ", filemode = " << cmode
3204 << ", need_auth = " << need_auth
3205 << dendl;
3206
3207 // regular file?
3208 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3209 dout(7) << "not a file or dir " << *cur << dendl;
3210 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3211 return;
3212 }*/
3213 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3214 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3215 respond_to_request(mdr, -EINVAL);
3216 return;
3217 }
3218
3219 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3220 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3221 // return -EISDIR for directories and -EINVAL for other non-regular inodes
3222 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3223 return;
3224 }
3225
3226 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3227 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3228 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3229 respond_to_request(mdr, -EPERM);
3230 return;
3231 }
3232
3233 // snapped data is read only
3234 if (mdr->snapid != CEPH_NOSNAP &&
3235 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3236 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3237 respond_to_request(mdr, -EROFS);
3238 return;
3239 }
3240
3241 unsigned mask = req->head.args.open.mask;
3242 if (mask) {
3243 Capability *cap = cur->get_client_cap(mdr->get_client());
3244 int issued = 0;
3245 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3246 issued = cap->issued();
3247 // permission bits, ACL/security xattrs
3248 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3249 rdlocks.insert(&cur->authlock);
3250 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3251 rdlocks.insert(&cur->xattrlock);
3252
3253 mdr->getattr_caps = mask;
3254 }
3255
3256 // O_TRUNC
3257 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3258 assert(cur->is_auth());
3259
3260 xlocks.insert(&cur->filelock);
3261 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3262 return;
3263
3264 if (!check_access(mdr, cur, MAY_WRITE))
3265 return;
3266
3267 // wait for pending truncate?
3268 const inode_t *pi = cur->get_projected_inode();
3269 if (pi->is_truncating()) {
3270 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3271 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3272 mds->locker->drop_locks(mdr.get());
3273 mdr->drop_local_auth_pins();
3274 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3275 return;
3276 }
3277
3278 do_open_truncate(mdr, cmode);
3279 return;
3280 }
3281
3282 // sync filelock if snapped.
3283 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3284 // and that data itself is flushed so that we can read the snapped data off disk.
3285 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3286 rdlocks.insert(&cur->filelock);
3287 }
3288
3289 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3290 return;
3291
3292 mask = MAY_READ;
3293 if (cmode & CEPH_FILE_MODE_WR)
3294 mask |= MAY_WRITE;
3295 if (!check_access(mdr, cur, mask))
3296 return;
3297
3298 if (cur->is_file() || cur->is_dir()) {
3299 if (mdr->snapid == CEPH_NOSNAP) {
3300 // register new cap
3301 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3302 if (cap)
3303 dout(12) << "open issued caps " << ccap_string(cap->pending())
3304 << " for " << req->get_source()
3305 << " on " << *cur << dendl;
3306 } else {
3307 int caps = ceph_caps_for_mode(cmode);
3308 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3309 << " for " << req->get_source()
3310 << " snapid " << mdr->snapid
3311 << " on " << *cur << dendl;
3312 mdr->snap_caps = caps;
3313 }
3314 }
3315
3316 // increase max_size?
3317 if (cmode & CEPH_FILE_MODE_WR)
3318 mds->locker->check_inode_max_size(cur);
3319
3320 // make sure this inode gets into the journal
3321 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3322 !cur->item_open_file.is_on_list()) {
3323 LogSegment *ls = mds->mdlog->get_current_segment();
3324 EOpen *le = new EOpen(mds->mdlog);
3325 mdlog->start_entry(le);
3326 le->add_clean_inode(cur);
3327 ls->open_files.push_back(&cur->item_open_file);
3328 mdlog->submit_entry(le);
3329 }
3330
3331 // hit pop
3332 if (cmode & CEPH_FILE_MODE_WR)
3333 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
3334 else
3335 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
3336 mdr->client_request->get_source().num());
3337
3338 CDentry *dn = 0;
3339 if (req->get_dentry_wanted()) {
3340 assert(mdr->dn[0].size());
3341 dn = mdr->dn[0].back();
3342 }
3343
3344 mdr->tracei = cur;
3345 mdr->tracedn = dn;
3346 respond_to_request(mdr, 0);
3347 }
3348
3349 class C_MDS_openc_finish : public ServerLogContext {
3350 CDentry *dn;
3351 CInode *newi;
3352 snapid_t follows;
3353 public:
3354 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
3355 ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
3356 void finish(int r) override {
3357 assert(r == 0);
3358
3359 dn->pop_projected_linkage();
3360
3361 // dirty inode, dn, dir
3362 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3363 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3364 newi->_mark_dirty_parent(mdr->ls, true);
3365
3366 mdr->apply();
3367
3368 get_mds()->locker->share_inode_max_size(newi);
3369
3370 MDRequestRef null_ref;
3371 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3372
3373 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
3374
3375 server->respond_to_request(mdr, 0);
3376
3377 assert(g_conf->mds_kill_openc_at != 1);
3378 }
3379 };
3380
3381 /* This function takes responsibility for the passed mdr. */
3382 void Server::handle_client_openc(MDRequestRef& mdr)
3383 {
3384 MClientRequest *req = mdr->client_request;
3385 client_t client = mdr->get_client();
3386
3387 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
3388
3389 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
3390 if (cmode < 0) {
3391 respond_to_request(mdr, -EINVAL);
3392 return;
3393 }
3394
3395 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
3396
3397 if (!excl) {
3398 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
3399 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
3400 if (r > 0) return;
3401 if (r == 0) {
3402 // it existed.
3403 handle_client_open(mdr);
3404 return;
3405 }
3406 if (r < 0 && r != -ENOENT) {
3407 if (r == -ESTALE) {
3408 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3409 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
3410 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
3411 } else {
3412 dout(10) << "FAIL on error " << r << dendl;
3413 respond_to_request(mdr, r);
3414 }
3415 return;
3416 }
3417 }
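// Without O_EXCL the path is probed first: if it resolves, the request
// degrades to a plain handle_client_open() above; only a clean -ENOENT falls
// through to the create path below.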
3418
3419 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3420 file_layout_t *dir_layout = NULL;
3421 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
3422 !excl, false, false, &dir_layout);
3423 if (!dn) return;
3424 if (mdr->snapid != CEPH_NOSNAP) {
3425 respond_to_request(mdr, -EROFS);
3426 return;
3427 }
3428 // set layout
3429 file_layout_t layout;
3430 if (dir_layout)
3431 layout = *dir_layout;
3432 else
3433 layout = mdcache->default_file_layout;
3434
3435 // What kind of client caps are required to complete this operation
3436 uint64_t access = MAY_WRITE;
3437
3438 const auto default_layout = layout;
3439
3440 // fill in any special params from client
3441 if (req->head.args.open.stripe_unit)
3442 layout.stripe_unit = req->head.args.open.stripe_unit;
3443 if (req->head.args.open.stripe_count)
3444 layout.stripe_count = req->head.args.open.stripe_count;
3445 if (req->head.args.open.object_size)
3446 layout.object_size = req->head.args.open.object_size;
3447 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
3448 (__s32)req->head.args.open.pool >= 0) {
3449 layout.pool_id = req->head.args.open.pool;
3450
3451 // make sure we have as new a map as the client
3452 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
3453 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
3454 return;
3455 }
3456 }
3457
3458 // If the client doesn't have the capability to modify layout pools,
3459 // then only permit this request if the requested pool matches what the
3460 // file would have inherited anyway from its parent.
3461 if (default_layout != layout) {
3462 access |= MAY_SET_VXATTR;
3463 }
3464
3465 if (!layout.is_valid()) {
3466 dout(10) << " invalid initial file layout" << dendl;
3467 respond_to_request(mdr, -EINVAL);
3468 return;
3469 }
3470 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
3471 dout(10) << " invalid data pool " << layout.pool_id << dendl;
3472 respond_to_request(mdr, -EINVAL);
3473 return;
3474 }
3475
3476 // created null dn.
3477 CDir *dir = dn->get_dir();
3478 CInode *diri = dir->get_inode();
3479 rdlocks.insert(&diri->authlock);
3480 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3481 return;
3482
3483 if (!check_access(mdr, diri, access))
3484 return;
3485
3486 if (!check_fragment_space(mdr, dir))
3487 return;
3488
3489 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3490
3491 if (!dnl->is_null()) {
3492 // it existed.
3493 assert(req->head.args.open.flags & CEPH_O_EXCL);
3494 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
3495 mdr->tracei = dnl->get_inode();
3496 mdr->tracedn = dn;
3497 respond_to_request(mdr, -EEXIST);
3498 return;
3499 }
3500
3501 // create inode.
3502 SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3503 snapid_t follows = realm->get_newest_seq();
3504
3505 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
3506 req->head.args.open.mode | S_IFREG, &layout);
3507 assert(in);
3508
3509 // it's a file.
3510 dn->push_projected_linkage(in);
3511
3512 in->inode.version = dn->pre_dirty();
3513 if (layout.pool_id != mdcache->default_file_layout.pool_id)
3514 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
3515 in->inode.update_backtrace();
3516 if (cmode & CEPH_FILE_MODE_WR) {
3517 in->inode.client_ranges[client].range.first = 0;
3518 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
3519 in->inode.client_ranges[client].follows = follows;
3520 }
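  // (A writeable cap comes with an initial max_size of one layout size
  // increment; the client must ask the MDS to extend this range before
  // writing past it, which is how file size growth stays coordinated.
  // follows ties the range to the current snapshot sequence.)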
3521 in->inode.rstat.rfiles = 1;
3522
3523 assert(dn->first == follows+1);
3524 in->first = dn->first;
3525
3526 // prepare finisher
3527 mdr->ls = mdlog->get_current_segment();
3528 EUpdate *le = new EUpdate(mdlog, "openc");
3529 mdlog->start_entry(le);
3530 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
3531 journal_allocated_inos(mdr, &le->metablob);
3532 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
3533 le->metablob.add_primary_dentry(dn, in, true, true, true);
3534
3535 // do the open
3536 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
3537 in->authlock.set_state(LOCK_EXCL);
3538 in->xattrlock.set_state(LOCK_EXCL);
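  // the creating client is the only one holding caps on this brand-new
  // inode, so starting authlock/xattrlock in the exclusive state lets it
  // change mode/owner/xattrs without lock round-trips to the MDS.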
3539
3540 // make sure this inode gets into the journal
3541 le->metablob.add_opened_ino(in->ino());
3542 LogSegment *ls = mds->mdlog->get_current_segment();
3543 ls->open_files.push_back(&in->item_open_file);
3544
3545 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
3546
3547 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
3548 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
3549 // add the created ino onto the reply if the REPLY_CREATE_INODE feature is supported
3550 ::encode(in->inode.ino, mdr->reply_extra_bl);
3551 }
3552
3553 journal_and_reply(mdr, in, dn, le, fin);
3554
3555 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3556 // have overshot the split size (multiple opencs in flight), so here is
3557 // an early chance to split the dir if this openc makes it oversized.
3558 mds->balancer->maybe_fragment(dir, false);
3559 }
3560
3561
3562
3563 void Server::handle_client_readdir(MDRequestRef& mdr)
3564 {
3565 MClientRequest *req = mdr->client_request;
3566 client_t client = req->get_source().num();
3567 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3568 CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
3569 if (!diri) return;
3570
3571 // it's a directory, right?
3572 if (!diri->is_dir()) {
3573 // not a dir
3574 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
3575 respond_to_request(mdr, -ENOTDIR);
3576 return;
3577 }
3578
3579 rdlocks.insert(&diri->filelock);
3580 rdlocks.insert(&diri->dirfragtreelock);
3581
3582 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3583 return;
3584
3585 if (!check_access(mdr, diri, MAY_READ))
3586 return;
3587
3588 // which frag?
3589 frag_t fg = (__u32)req->head.args.readdir.frag;
3590 unsigned req_flags = (__u32)req->head.args.readdir.flags;
3591 string offset_str = req->get_path2();
3592
3593 __u32 offset_hash = 0;
3594 if (!offset_str.empty())
3595 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
3596 else
3597 offset_hash = (__u32)req->head.args.readdir.offset_hash;
3598
3599 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
3600 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
3601
3602 // does the frag exist?
3603 if (diri->dirfragtree[fg.value()] != fg) {
3604 frag_t newfg;
3605 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3606 if (fg.contains((unsigned)offset_hash)) {
3607 newfg = diri->dirfragtree[offset_hash];
3608 } else {
3609 // client actually wants next frag
3610 newfg = diri->dirfragtree[fg.value()];
3611 }
3612 } else {
3613 offset_str.clear();
3614 newfg = diri->dirfragtree[fg.value()];
3615 }
3616 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
3617 fg = newfg;
3618 }
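  // (e.g. if the requested frag has since been split or merged, the
  // fragtree lookup above resolves to the frag that now covers that part
  // of the hash space; with BITFLAGS the client's resume hash picks the
  // exact child frag, otherwise we restart the adjusted frag from the top.)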
3619
3620 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
3621 if (!dir) return;
3622
3623 // ok!
3624 dout(10) << "handle_client_readdir on " << *dir << dendl;
3625 assert(dir->is_auth());
3626
3627 if (!dir->is_complete()) {
3628 if (dir->is_frozen()) {
3629 dout(7) << "dir is frozen " << *dir << dendl;
3630 mds->locker->drop_locks(mdr.get());
3631 mdr->drop_local_auth_pins();
3632 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3633 return;
3634 }
3635 // fetch
3636 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
3637 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3638 return;
3639 }
3640
3641 #ifdef MDS_VERIFY_FRAGSTAT
3642 dir->verify_fragstat();
3643 #endif
3644
3645 utime_t now = ceph_clock_now();
3646 mdr->set_mds_stamp(now);
3647
3648 snapid_t snapid = mdr->snapid;
3649 dout(10) << "snapid " << snapid << dendl;
3650
3651 SnapRealm *realm = diri->find_snaprealm();
3652
3653 unsigned max = req->head.args.readdir.max_entries;
3654 if (!max)
3655 max = dir->get_num_any(); // whatever, something big.
3656 unsigned max_bytes = req->head.args.readdir.max_bytes;
3657 if (!max_bytes)
3658 // make sure at least one item can be encoded
3659 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
3660
3661 // start final blob
3662 bufferlist dirbl;
3663 dir->encode_dirstat(dirbl, mds->get_nodeid());
3664
3665 // count bytes available.
3666 // this isn't perfect, but we should capture the main variable/unbounded size items!
3667 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
3668 int bytes_left = max_bytes - front_bytes;
3669 bytes_left -= realm->get_snap_trace().length();
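  // An illustrative sketch (not compiled) of the budget arithmetic above;
  // all sizes here are made up except the struct sizes and the max_bytes
  // default:
#if 0
  static int64_t readdir_budget_example()
  {
    int64_t max_bytes = (512 << 10) + 65536;  // assume 64 KiB mds_max_xattr_pairs_size
    int64_t dirstat_len = 50;                 // assumed encoded dirstat size
    int64_t snap_trace_len = 16;              // assumed snap trace size
    int64_t front_bytes = dirstat_len + sizeof(__u32) + sizeof(__u8) * 2;
    return max_bytes - front_bytes - snap_trace_len;  // left for dentries+inodes
  }
#endif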
3670
3671 // build dir contents
3672 bufferlist dnbl;
3673 __u32 numfiles = 0;
3674 bool start = !offset_hash && offset_str.empty();
3675 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3676 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
3677 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
3678 bool end = (it == dir->end());
3679 for (; !end && numfiles < max; end = (it == dir->end())) {
3680 CDentry *dn = it->second;
3681 ++it;
3682
3683 if (dn->state_test(CDentry::STATE_PURGING))
3684 continue;
3685
3686 bool dnp = dn->use_projected(client, mdr);
3687 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
3688
3689 if (dnl->is_null())
3690 continue;
3691
3692 if (dn->last < snapid || dn->first > snapid) {
3693 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
3694 continue;
3695 }
3696
3697 if (!start) {
3698 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
3699 if (!(offset_key < dn->key()))
3700 continue;
3701 }
3702
3703 CInode *in = dnl->get_inode();
3704
3705 if (in && in->ino() == CEPH_INO_CEPH)
3706 continue;
3707
3708 // remote link?
3709 // better for the MDS to do the work, if we think the client will stat any of these files.
3710 if (dnl->is_remote() && !in) {
3711 in = mdcache->get_inode(dnl->get_remote_ino());
3712 if (in) {
3713 dn->link_remote(dnl, in);
3714 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
3715 dout(10) << "skipping bad remote ino on " << *dn << dendl;
3716 continue;
3717 } else {
3718 // touch everything I _do_ have
3719 for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
3720 if (!p->second->get_linkage()->is_null())
3721 mdcache->lru.lru_touch(p->second);
3722
3723 // we already issued caps and leases; reply immediately.
3724 if (dnbl.length() > 0) {
3725 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
3726 dout(10) << " open remote dentry after caps were issued, stopping at "
3727 << dnbl.length() << " < " << bytes_left << dendl;
3728 break;
3729 }
3730
3731 mds->locker->drop_locks(mdr.get());
3732 mdr->drop_local_auth_pins();
3733 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
3734 return;
3735 }
3736 }
3737 assert(in);
3738
3739 if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
3740 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
3741 break;
3742 }
3743
3744 unsigned start_len = dnbl.length();
3745
3746 // dentry
3747 dout(12) << "including dn " << *dn << dendl;
3748 ::encode(dn->name, dnbl);
3749 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
3750
3751 // inode
3752 dout(12) << "including inode " << *in << dendl;
3753 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
3754 if (r < 0) {
3755 // chop off dn->name, lease
3756 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
3757 bufferlist keep;
3758 keep.substr_of(dnbl, 0, start_len);
3759 dnbl.swap(keep);
3760 break;
3761 }
3762 assert(r >= 0);
3763 numfiles++;
3764
3765 // touch dn
3766 mdcache->lru.lru_touch(dn);
3767 }
3768
3769 __u16 flags = 0;
3770 if (end) {
3771 flags = CEPH_READDIR_FRAG_END;
3772 if (start)
3773 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
3774 }
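  // (COMPLETE tells the client it has seen the whole frag in one sweep, so
  // it may mark its cached copy of the directory complete and satisfy later
  // lookups and readdirs locally without asking the MDS again.)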
3775 // clients only understand the END and COMPLETE flags?
3776 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3777 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
3778 }
3779
3780 // finish final blob
3781 ::encode(numfiles, dirbl);
3782 ::encode(flags, dirbl);
3783 dirbl.claim_append(dnbl);
3784
3785 // yay, reply
3786 dout(10) << "reply to " << *req << " readdir num=" << numfiles
3787 << " bytes=" << dirbl.length()
3788 << " start=" << (int)start
3789 << " end=" << (int)end
3790 << dendl;
3791 mdr->reply_extra_bl = dirbl;
3792
3793 // bump popularity. NOTE: this doesn't quite capture it.
3794 mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);
3795
3796 // reply
3797 mdr->tracei = diri;
3798 respond_to_request(mdr, 0);
3799 }
3800
3801
3802
3803 // ===============================================================================
3804 // INODE UPDATES
3805
3806
3807 /*
3808 * finisher for basic inode updates
3809 */
3810 class C_MDS_inode_update_finish : public ServerLogContext {
3811 CInode *in;
3812 bool truncating_smaller, changed_ranges;
3813 public:
3814 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
3815 bool sm=false, bool cr=false) :
3816 ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
3817 void finish(int r) override {
3818 assert(r == 0);
3819
3820 // apply
3821 in->pop_and_dirty_projected_inode(mdr->ls);
3822 mdr->apply();
3823
3824 // notify any clients
3825 if (truncating_smaller && in->inode.is_truncating()) {
3826 get_mds()->locker->issue_truncate(in);
3827 get_mds()->mdcache->truncate_inode(in, mdr->ls);
3828 }
3829
3830 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
3831
3832 server->respond_to_request(mdr, 0);
3833
3834 if (changed_ranges)
3835 get_mds()->locker->share_inode_max_size(in);
3836 }
3837 };
3838
3839 void Server::handle_client_file_setlock(MDRequestRef& mdr)
3840 {
3841 MClientRequest *req = mdr->client_request;
3842 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3843
3844 // get the inode to operate on, and set up any locks needed for that
3845 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3846 if (!cur)
3847 return;
3848
3849 xlocks.insert(&cur->flocklock);
3850 /* acquire_locks will return true if it gets the locks. If it fails,
3851 it will re-dispatch this request later, so just return here.
3852 */
3853 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3854 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3855 return;
3856 }
3857
3858 // copy the lock change into a ceph_filelock so we can store/apply it
3859 ceph_filelock set_lock;
3860 set_lock.start = req->head.args.filelock_change.start;
3861 set_lock.length = req->head.args.filelock_change.length;
3862 set_lock.client = req->get_orig_source().num();
3863 set_lock.owner = req->head.args.filelock_change.owner;
3864 set_lock.pid = req->head.args.filelock_change.pid;
3865 set_lock.type = req->head.args.filelock_change.type;
3866 bool will_wait = req->head.args.filelock_change.wait;
3867
3868 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3869
3870 ceph_lock_state_t *lock_state = NULL;
3871 bool interrupt = false;
3872
3873 // get the appropriate lock state
3874 switch (req->head.args.filelock_change.rule) {
3875 case CEPH_LOCK_FLOCK_INTR:
3876 interrupt = true;
3877 // fall-thru
3878 case CEPH_LOCK_FLOCK:
3879 lock_state = cur->get_flock_lock_state();
3880 break;
3881
3882 case CEPH_LOCK_FCNTL_INTR:
3883 interrupt = true;
3884 // fall-thru
3885 case CEPH_LOCK_FCNTL:
3886 lock_state = cur->get_fcntl_lock_state();
3887 break;
3888
3889 default:
3890 dout(10) << "got unknown lock type " << set_lock.type
3891 << ", dropping request!" << dendl;
3892 respond_to_request(mdr, -EOPNOTSUPP);
3893 return;
3894 }
3895
3896 dout(10) << " state prior to lock change: " << *lock_state << dendl;
3897 if (CEPH_LOCK_UNLOCK == set_lock.type) {
3898 list<ceph_filelock> activated_locks;
3899 list<MDSInternalContextBase*> waiters;
3900 if (lock_state->is_waiting(set_lock)) {
3901 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
3902 lock_state->remove_waiting(set_lock);
3903 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3904 } else if (!interrupt) {
3905 dout(10) << " unlock attempt on " << set_lock << dendl;
3906 lock_state->remove_lock(set_lock, activated_locks);
3907 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3908 }
3909 mds->queue_waiters(waiters);
3910
3911 respond_to_request(mdr, 0);
3912 } else {
3913 dout(10) << " lock attempt on " << set_lock << dendl;
3914 bool deadlock = false;
3915 if (mdr->more()->flock_was_waiting &&
3916 !lock_state->is_waiting(set_lock)) {
3917 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
3918 respond_to_request(mdr, -EINTR);
3919 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
3920 dout(10) << " it failed on this attempt" << dendl;
3921 // couldn't set lock right now
3922 if (deadlock) {
3923 respond_to_request(mdr, -EDEADLK);
3924 } else if (!will_wait) {
3925 respond_to_request(mdr, -EWOULDBLOCK);
3926 } else {
3927 dout(10) << " added to waiting list" << dendl;
3928 assert(lock_state->is_waiting(set_lock));
3929 mdr->more()->flock_was_waiting = true;
3930 mds->locker->drop_locks(mdr.get());
3931 mdr->drop_local_auth_pins();
3932 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
3933 }
3934 } else
3935 respond_to_request(mdr, 0);
3936 }
3937 dout(10) << " state after lock change: " << *lock_state << dendl;
3938 }
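// An illustrative sketch (not compiled) of the record the handler above
// builds from the client request; all field values here are hypothetical:
#if 0
static ceph_filelock example_setlock_record()
{
  ceph_filelock fl;
  memset(&fl, 0, sizeof(fl));
  fl.start = 0;               // byte offset where the lock begins
  fl.length = 0;              // 0 conventionally means "through EOF"
  fl.client = 4100;           // assumed client id
  fl.owner = 1;               // opaque lock-owner token from the client
  fl.pid = 1234;
  fl.type = CEPH_LOCK_UNLOCK; // or CEPH_LOCK_SHARED / CEPH_LOCK_EXCL
  return fl;
}
#endif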
3939
3940 void Server::handle_client_file_readlock(MDRequestRef& mdr)
3941 {
3942 MClientRequest *req = mdr->client_request;
3943 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3944
3945 // get the inode to operate on, and set up any locks needed for that
3946 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3947 if (!cur)
3948 return;
3949
3950 /* acquire_locks will return true if it gets the locks. If it fails,
3951 it will re-dispatch this request later, so just return here.
3952 */
3953 rdlocks.insert(&cur->flocklock);
3954 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3955 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
3956 return;
3957 }
3958
3959 // copy the lock change into a ceph_filelock so we can store/apply it
3960 ceph_filelock checking_lock;
3961 checking_lock.start = req->head.args.filelock_change.start;
3962 checking_lock.length = req->head.args.filelock_change.length;
3963 checking_lock.client = req->get_orig_source().num();
3964 checking_lock.owner = req->head.args.filelock_change.owner;
3965 checking_lock.pid = req->head.args.filelock_change.pid;
3966 checking_lock.type = req->head.args.filelock_change.type;
3967
3968 // get the appropriate lock state
3969 ceph_lock_state_t *lock_state = NULL;
3970 switch (req->head.args.filelock_change.rule) {
3971 case CEPH_LOCK_FLOCK:
3972 lock_state = cur->get_flock_lock_state();
3973 break;
3974
3975 case CEPH_LOCK_FCNTL:
3976 lock_state = cur->get_fcntl_lock_state();
3977 break;
3978
3979 default:
3980 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
3981 respond_to_request(mdr, -EINVAL);
3982 return;
3983 }
3984 lock_state->look_for_lock(checking_lock);
3985
3986 bufferlist lock_bl;
3987 ::encode(checking_lock, lock_bl);
3988
3989 mdr->reply_extra_bl = lock_bl;
3990 respond_to_request(mdr, 0);
3991 }
3992
3993 void Server::handle_client_setattr(MDRequestRef& mdr)
3994 {
3995 MClientRequest *req = mdr->client_request;
3996 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3997 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3998 if (!cur) return;
3999
4000 if (mdr->snapid != CEPH_NOSNAP) {
4001 respond_to_request(mdr, -EROFS);
4002 return;
4003 }
4004 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4005 respond_to_request(mdr, -EPERM);
4006 return;
4007 }
4008
4009 __u32 mask = req->head.args.setattr.mask;
4010 __u32 access_mask = MAY_WRITE;
4011
4012 // xlock inode
4013 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4014 xlocks.insert(&cur->authlock);
4015 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4016 xlocks.insert(&cur->filelock);
4017 if (mask & CEPH_SETATTR_CTIME)
4018 wrlocks.insert(&cur->versionlock);
4019
4020 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4021 return;
4022
4023 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4024 access_mask |= MAY_CHOWN;
4025
4026 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4027 access_mask |= MAY_CHGRP;
4028
4029 if (!check_access(mdr, cur, access_mask))
4030 return;
4031
4032 // trunc from bigger -> smaller?
4033 inode_t *pi = cur->get_projected_inode();
4034
4035 uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);
4036
4037 // ENOSPC on growing file while full, but allow shrinks
4038 if (is_full && req->head.args.setattr.size > old_size) {
4039 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4040 respond_to_request(mdr, -ENOSPC);
4041 return;
4042 }
4043
4044 bool truncating_smaller = false;
4045 if (mask & CEPH_SETATTR_SIZE) {
4046 truncating_smaller = req->head.args.setattr.size < old_size;
4047 if (truncating_smaller && pi->is_truncating()) {
4048 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4049 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4050 mds->locker->drop_locks(mdr.get());
4051 mdr->drop_local_auth_pins();
4052 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4053 return;
4054 }
4055 }
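  // (only one truncation can be in flight per inode: the RADOS-side trim
  // of the previous truncate must finish and clear is_truncating() before
  // we project another size change, hence the WAIT_TRUNC wait above.)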
4056
4057 bool changed_ranges = false;
4058
4059 // project update
4060 mdr->ls = mdlog->get_current_segment();
4061 EUpdate *le = new EUpdate(mdlog, "setattr");
4062 mdlog->start_entry(le);
4063
4064 pi = cur->project_inode();
4065
4066 if (mask & CEPH_SETATTR_UID)
4067 pi->uid = req->head.args.setattr.uid;
4068 if (mask & CEPH_SETATTR_GID)
4069 pi->gid = req->head.args.setattr.gid;
4070
4071 if (mask & CEPH_SETATTR_MODE)
4072 pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
4073 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4074 S_ISREG(pi->mode) &&
4075 (pi->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4076 pi->mode &= ~(S_ISUID|S_ISGID);
4077 }
4078
4079 if (mask & CEPH_SETATTR_MTIME)
4080 pi->mtime = req->head.args.setattr.mtime;
4081 if (mask & CEPH_SETATTR_ATIME)
4082 pi->atime = req->head.args.setattr.atime;
4083 if (mask & CEPH_SETATTR_BTIME)
4084 pi->btime = req->head.args.setattr.btime;
4085 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4086 pi->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4087 if (mask & CEPH_SETATTR_SIZE) {
4088 if (truncating_smaller) {
4089 pi->truncate(old_size, req->head.args.setattr.size);
4090 le->metablob.add_truncate_start(cur->ino());
4091 } else {
4092 pi->size = req->head.args.setattr.size;
4093 pi->rstat.rbytes = pi->size;
4094 }
4095 pi->mtime = mdr->get_op_stamp();
4096
4097 // adjust client's max_size?
4098 map<client_t,client_writeable_range_t> new_ranges;
4099 bool max_increased = false;
4100 mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
4101 if (pi->client_ranges != new_ranges) {
4102 dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
4103 pi->client_ranges = new_ranges;
4104 changed_ranges = true;
4105 }
4106 }
4107
4108 pi->version = cur->pre_dirty();
4109 pi->ctime = mdr->get_op_stamp();
4110 pi->change_attr++;
4111
4112 // log + wait
4113 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4114 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4115 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4116
4117 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4118 truncating_smaller, changed_ranges));
4119
4120 // flush immediately if there are readers/writers waiting
4121 if (xlocks.count(&cur->filelock) &&
4122 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4123 mds->mdlog->flush();
4124 }
4125
4126 /* Takes responsibility for mdr */
4127 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4128 {
4129 CInode *in = mdr->in[0];
4130 client_t client = mdr->get_client();
4131 assert(in);
4132
4133 dout(10) << "do_open_truncate " << *in << dendl;
4134
4135 SnapRealm *realm = in->find_snaprealm();
4136 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4137
4138 mdr->ls = mdlog->get_current_segment();
4139 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4140 mdlog->start_entry(le);
4141
4142 // prepare
4143 inode_t *pi = in->project_inode();
4144 pi->version = in->pre_dirty();
4145 pi->mtime = pi->ctime = mdr->get_op_stamp();
4146 pi->change_attr++;
4147
4148 uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
4149 if (old_size > 0) {
4150 pi->truncate(old_size, 0);
4151 le->metablob.add_truncate_start(in->ino());
4152 }
4153
4154 bool changed_ranges = false;
4155 if (cmode & CEPH_FILE_MODE_WR) {
4156 pi->client_ranges[client].range.first = 0;
4157 pi->client_ranges[client].range.last = pi->get_layout_size_increment();
4158 pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
4159 changed_ranges = true;
4160 }
4161
4162 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4163
4164 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4165 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4166
4167 // make sure ino gets into the journal
4168 le->metablob.add_opened_ino(in->ino());
4169 LogSegment *ls = mds->mdlog->get_current_segment();
4170 ls->open_files.push_back(&in->item_open_file);
4171
4172 mdr->o_trunc = true;
4173
4174 CDentry *dn = 0;
4175 if (mdr->client_request->get_dentry_wanted()) {
4176 assert(mdr->dn[0].size());
4177 dn = mdr->dn[0].back();
4178 }
4179
4180 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4181 changed_ranges));
4182 // Although the `open` part can give an early reply, the truncation won't
4183 // happen until our EUpdate is persistent; to give the client a prompt
4184 // response we must also flush that event.
4185 mdlog->flush();
4186 }
4187
4188
4189 /* This function cleans up the passed mdr */
4190 void Server::handle_client_setlayout(MDRequestRef& mdr)
4191 {
4192 MClientRequest *req = mdr->client_request;
4193 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4194 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4195 if (!cur) return;
4196
4197 if (mdr->snapid != CEPH_NOSNAP) {
4198 respond_to_request(mdr, -EROFS);
4199 return;
4200 }
4201 if (!cur->is_file()) {
4202 respond_to_request(mdr, -EINVAL);
4203 return;
4204 }
4205 if (cur->get_projected_inode()->size ||
4206 cur->get_projected_inode()->truncate_seq > 1) {
4207 respond_to_request(mdr, -ENOTEMPTY);
4208 return;
4209 }
4210
4211 // validate layout
4212 file_layout_t layout = cur->get_projected_inode()->layout;
4213 // save existing layout for later
4214 const auto old_layout = layout;
4215
4216 int access = MAY_WRITE;
4217
4218 if (req->head.args.setlayout.layout.fl_object_size > 0)
4219 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4220 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4221 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4222 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4223 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4224 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4225 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4226
4227 // make sure we have as new a map as the client
4228 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4229 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4230 return;
4231 }
4232 }
4233
4234 // Don't permit layout modifications without 'p' caps
4235 if (layout != old_layout) {
4236 access |= MAY_SET_VXATTR;
4237 }
4238
4239 if (!layout.is_valid()) {
4240 dout(10) << "bad layout" << dendl;
4241 respond_to_request(mdr, -EINVAL);
4242 return;
4243 }
4244 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4245 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4246 respond_to_request(mdr, -EINVAL);
4247 return;
4248 }
4249
4250 xlocks.insert(&cur->filelock);
4251 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4252 return;
4253
4254 if (!check_access(mdr, cur, access))
4255 return;
4256
4257 // project update
4258 inode_t *pi = cur->project_inode();
4259 pi->layout = layout;
4260 // add the old pool to the inode
4261 pi->add_old_pool(old_layout.pool_id);
4262 pi->version = cur->pre_dirty();
4263 pi->ctime = mdr->get_op_stamp();
4264 pi->change_attr++;
4265
4266 // log + wait
4267 mdr->ls = mdlog->get_current_segment();
4268 EUpdate *le = new EUpdate(mdlog, "setlayout");
4269 mdlog->start_entry(le);
4270 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4271 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4272 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4273
4274 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4275 }
4276
4277 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4278 {
4279 MClientRequest *req = mdr->client_request;
4280 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4281 file_layout_t *dir_layout = NULL;
4282 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4283 if (!cur) return;
4284
4285 if (mdr->snapid != CEPH_NOSNAP) {
4286 respond_to_request(mdr, -EROFS);
4287 return;
4288 }
4289
4290 if (!cur->is_dir()) {
4291 respond_to_request(mdr, -ENOTDIR);
4292 return;
4293 }
4294
4295 xlocks.insert(&cur->policylock);
4296 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4297 return;
4298
4299 // validate layout
4300 const inode_t *old_pi = cur->get_projected_inode();
4301 file_layout_t layout;
4302 if (old_pi->has_layout())
4303 layout = old_pi->layout;
4304 else if (dir_layout)
4305 layout = *dir_layout;
4306 else
4307 layout = mdcache->default_file_layout;
4308
4309 // Level of access required to complete
4310 int access = MAY_WRITE;
4311
4312 const auto old_layout = layout;
4313
4314 if (req->head.args.setlayout.layout.fl_object_size > 0)
4315 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4316 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4317 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4318 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4319 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4320 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4321 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4322 // make sure we have as new a map as the client
4323 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4324 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4325 return;
4326 }
4327 }
4328
4329 if (layout != old_layout) {
4330 access |= MAY_SET_VXATTR;
4331 }
4332
4333 if (!layout.is_valid()) {
4334 dout(10) << "bad layout" << dendl;
4335 respond_to_request(mdr, -EINVAL);
4336 return;
4337 }
4338 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4339 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4340 respond_to_request(mdr, -EINVAL);
4341 return;
4342 }
4343
4344 if (!check_access(mdr, cur, access))
4345 return;
4346
4347 inode_t *pi = cur->project_inode();
4348 pi->layout = layout;
4349 pi->version = cur->pre_dirty();
4350
4351 // log + wait
4352 mdr->ls = mdlog->get_current_segment();
4353 EUpdate *le = new EUpdate(mdlog, "setlayout");
4354 mdlog->start_entry(le);
4355 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4356 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4357 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4358
4359 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4360 }
4361
4362 // XATTRS
4363
4364 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4365 file_layout_t *layout, bool validate)
4366 {
4367 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4368 try {
4369 if (name == "layout") {
4370 string::iterator begin = value.begin();
4371 string::iterator end = value.end();
4372 keys_and_values<string::iterator> p; // create instance of parser
4373 std::map<string, string> m; // map to receive results
4374 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4375 return -EINVAL;
4376 }
4377 string left(begin, end);
4378 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4379 if (begin != end)
4380 return -EINVAL;
4381 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4382 // Skip validation on each attr; we do it once at the end (avoid
4383 // rejecting intermediate states if the overall result is ok)
4384 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4385 osdmap, layout, false);
4386 if (r < 0)
4387 return r;
4388 }
4389 } else if (name == "layout.object_size") {
4390 layout->object_size = boost::lexical_cast<unsigned>(value);
4391 } else if (name == "layout.stripe_unit") {
4392 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4393 } else if (name == "layout.stripe_count") {
4394 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4395 } else if (name == "layout.pool") {
4396 try {
4397 layout->pool_id = boost::lexical_cast<unsigned>(value);
4398 } catch (boost::bad_lexical_cast const&) {
4399 int64_t pool = osdmap.lookup_pg_pool_name(value);
4400 if (pool < 0) {
4401 dout(10) << " unknown pool " << value << dendl;
4402 return -ENOENT;
4403 }
4404 layout->pool_id = pool;
4405 }
4406 } else if (name == "layout.pool_namespace") {
4407 layout->pool_ns = value;
4408 } else {
4409 dout(10) << " unknown layout vxattr " << name << dendl;
4410 return -EINVAL;
4411 }
4412 } catch (boost::bad_lexical_cast const&) {
4413 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4414 return -EINVAL;
4415 }
4416
4417 if (validate && !layout->is_valid()) {
4418 dout(10) << "bad layout" << dendl;
4419 return -EINVAL;
4420 }
4421 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4422 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4423 return -EINVAL;
4424 }
4425 return 0;
4426 }
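// An illustrative call (not compiled) of the composite form above. The
// value string mirrors what a client-side
//   setfattr -n ceph.file.layout -v "..." <file>
// would deliver; the pool name here is hypothetical:
#if 0
file_layout_t layout;
int r = parse_layout_vxattr("layout",
                            "stripe_unit=1048576 stripe_count=2 "
                            "object_size=8388608 pool=cephfs_data",
                            osdmap, &layout, true);
// r == 0 on success; -ENOENT if the named pool is unknown to this osdmap.
#endif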
4427
4428 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4429 {
4430 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4431 try {
4432 if (name == "quota") {
4433 string::iterator begin = value.begin();
4434 string::iterator end = value.end();
4435 keys_and_values<string::iterator> p; // create instance of parser
4436 std::map<string, string> m; // map to receive results
4437 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4438 return -EINVAL;
4439 }
4440 string left(begin, end);
4441 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4442 if (begin != end)
4443 return -EINVAL;
4444 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4445 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4446 if (r < 0)
4447 return r;
4448 }
4449 } else if (name == "quota.max_bytes") {
4450 int64_t q = boost::lexical_cast<int64_t>(value);
4451 if (q < 0)
4452 return -EINVAL;
4453 quota->max_bytes = q;
4454 } else if (name == "quota.max_files") {
4455 int64_t q = boost::lexical_cast<int64_t>(value);
4456 if (q < 0)
4457 return -EINVAL;
4458 quota->max_files = q;
4459 } else {
4460 dout(10) << " unknown quota vxattr " << name << dendl;
4461 return -EINVAL;
4462 }
4463 } catch (boost::bad_lexical_cast const&) {
4464 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4465 return -EINVAL;
4466 }
4467
4468 if (!quota->is_valid()) {
4469 dout(10) << "bad quota" << dendl;
4470 return -EINVAL;
4471 }
4472 return 0;
4473 }
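// An illustrative call (not compiled); the limits are hypothetical:
#if 0
quota_info_t quota;
int r = parse_quota_vxattr("quota", "max_bytes=100000000 max_files=10000",
                           &quota);
// a value of 0 for either field means "no limit"; negatives are rejected.
#endif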
4474
4475 /*
4476 * Verify that the file layout attribute carried by client
4477 * is well-formatted.
4478 * Return 0 on success, otherwise this function takes
4479 * responsibility for the passed mdr.
4480 */
4481 int Server::check_layout_vxattr(MDRequestRef& mdr,
4482 string name,
4483 string value,
4484 file_layout_t *layout)
4485 {
4486 MClientRequest *req = mdr->client_request;
4487 epoch_t epoch;
4488 int r;
4489
4490 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4491 r = parse_layout_vxattr(name, value, osdmap, layout);
4492 epoch = osdmap.get_epoch();
4493 });
4494
4495 if (r == -ENOENT) {
4496
4497 // we don't have the specified pool; make sure our map
4498 // is at least as new as the client's.
4499 epoch_t req_epoch = req->get_osdmap_epoch();
4500
4501 if (req_epoch > epoch) {
4502
4503 // well, our map is older. wait for a newer osdmap.
4504 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4505
4506 if (!mds->objecter->wait_for_map(req_epoch, fin))
4507 return r; // wait, fin will retry this request later
4508
4509 delete fin;
4510
4511 // now we have at least as new a map as the client, try again.
4512 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4513 r = parse_layout_vxattr(name, value, osdmap, layout);
4514 epoch = osdmap.get_epoch();
4515 });
4516
4517 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4518
4519 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4520
4521 // For compatibility with clients running old code, we still need to
4522 // get the latest map. One day, once COMPACT_VERSION of MClientRequest
4523 // is >= 3, this code can be removed.
4524 mdr->waited_for_osdmap = true;
4525 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4526 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4527 return r;
4528 }
4529 }
4530
4531 if (r < 0) {
4532
4533 if (r == -ENOENT)
4534 r = -EINVAL;
4535
4536 respond_to_request(mdr, r);
4537 return r;
4538 }
4539
4540 // all is well
4541 return 0;
4542 }
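// (summary of the epoch dance above: an unknown pool only becomes an error
// once our osdmap is at least as new as the client's. If the client's map
// is newer we wait for ours to catch up and retry; a legacy client that
// sends epoch 0 gets one forced fetch of the latest map; after that an
// unknown pool is reported as -EINVAL.)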
4543
4544 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
4545 file_layout_t *dir_layout,
4546 set<SimpleLock*> rdlocks,
4547 set<SimpleLock*> wrlocks,
4548 set<SimpleLock*> xlocks)
4549 {
4550 MClientRequest *req = mdr->client_request;
4551 string name(req->get_path2());
4552 bufferlist bl = req->get_data();
4553 string value (bl.c_str(), bl.length());
4554 dout(10) << "handle_set_vxattr " << name
4555 << " val " << value.length()
4556 << " bytes on " << *cur
4557 << dendl;
4558
4559 inode_t *pi = NULL;
4560 string rest;
4561
4562 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
4563 return;
4564 }
4565
4566 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
4567 if (!cur->is_dir()) {
4568 respond_to_request(mdr, -EINVAL);
4569 return;
4570 }
4571
4572 file_layout_t layout;
4573 if (cur->get_projected_inode()->has_layout())
4574 layout = cur->get_projected_inode()->layout;
4575 else if (dir_layout)
4576 layout = *dir_layout;
4577 else
4578 layout = mdcache->default_file_layout;
4579
4580 rest = name.substr(name.find("layout"));
4581 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4582 return;
4583
4584 xlocks.insert(&cur->policylock);
4585 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4586 return;
4587
4588 pi = cur->project_inode();
4589 pi->layout = layout;
4590 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
4591 if (!cur->is_file()) {
4592 respond_to_request(mdr, -EINVAL);
4593 return;
4594 }
4595 if (cur->get_projected_inode()->size ||
4596 cur->get_projected_inode()->truncate_seq > 1) {
4597 respond_to_request(mdr, -ENOTEMPTY);
4598 return;
4599 }
4600 file_layout_t layout = cur->get_projected_inode()->layout;
4601 rest = name.substr(name.find("layout"));
4602 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4603 return;
4604
4605 xlocks.insert(&cur->filelock);
4606 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4607 return;
4608
4609 pi = cur->project_inode();
4610 int64_t old_pool = pi->layout.pool_id;
4611 pi->add_old_pool(old_pool);
4612 pi->layout = layout;
4613 pi->ctime = mdr->get_op_stamp();
4614 } else if (name.compare(0, 10, "ceph.quota") == 0) {
4615 if (!cur->is_dir() || cur->is_root()) {
4616 respond_to_request(mdr, -EINVAL);
4617 return;
4618 }
4619
4620 quota_info_t quota = cur->get_projected_inode()->quota;
4621
4622 rest = name.substr(name.find("quota"));
4623 int r = parse_quota_vxattr(rest, value, &quota);
4624 if (r < 0) {
4625 respond_to_request(mdr, r);
4626 return;
4627 }
4628
4629 xlocks.insert(&cur->policylock);
4630 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4631 return;
4632
4633 pi = cur->project_inode();
4634 pi->quota = quota;
4635 } else if (name.find("ceph.dir.pin") == 0) {
4636 if (!cur->is_dir() || cur->is_root()) {
4637 respond_to_request(mdr, -EINVAL);
4638 return;
4639 }
4640
4641 mds_rank_t rank;
4642 try {
4643 rank = boost::lexical_cast<mds_rank_t>(value);
4644 if (rank < 0) rank = MDS_RANK_NONE;
4645 } catch (boost::bad_lexical_cast const&) {
4646 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4647 respond_to_request(mdr, -EINVAL);
4648 return;
4649 }
4650
4651 xlocks.insert(&cur->policylock);
4652 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4653 return;
4654
4655 pi = cur->project_inode();
4656 cur->set_export_pin(rank);
4657 } else {
4658 dout(10) << " unknown vxattr " << name << dendl;
4659 respond_to_request(mdr, -EINVAL);
4660 return;
4661 }
4662
4663 pi->change_attr++;
4664 pi->ctime = mdr->get_op_stamp();
4665 pi->version = cur->pre_dirty();
4666 if (cur->is_file())
4667 pi->update_backtrace();
4668
4669 // log + wait
4670 mdr->ls = mdlog->get_current_segment();
4671 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
4672 mdlog->start_entry(le);
4673 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4674 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4675 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4676
4677 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4678 return;
4679 }
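// (the ceph.dir.pin branch above is what a client-side
//   setfattr -n ceph.dir.pin -v <rank> <dir>
// lands on: a non-negative rank pins the subtree to that MDS via
// set_export_pin(), and any negative value maps to MDS_RANK_NONE,
// i.e. removes the pin.)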
4680
4681 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
4682 file_layout_t *dir_layout,
4683 set<SimpleLock*> rdlocks,
4684 set<SimpleLock*> wrlocks,
4685 set<SimpleLock*> xlocks)
4686 {
4687 MClientRequest *req = mdr->client_request;
4688 string name(req->get_path2());
4689
4690 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
4691
4692 if (name == "ceph.dir.layout") {
4693 if (!cur->is_dir()) {
4694 respond_to_request(mdr, -ENODATA);
4695 return;
4696 }
4697 if (cur->is_root()) {
4698 dout(10) << "can't remove layout policy on the root directory" << dendl;
4699 respond_to_request(mdr, -EINVAL);
4700 return;
4701 }
4702
4703 if (!cur->get_projected_inode()->has_layout()) {
4704 respond_to_request(mdr, -ENODATA);
4705 return;
4706 }
4707
4708 xlocks.insert(&cur->policylock);
4709 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4710 return;
4711
4712 inode_t *pi = cur->project_inode();
4713 pi->clear_layout();
4714 pi->version = cur->pre_dirty();
4715
4716 // log + wait
4717 mdr->ls = mdlog->get_current_segment();
4718 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
4719 mdlog->start_entry(le);
4720 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4721 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4722 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4723
4724 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4725 return;
4726 } else if (name == "ceph.dir.layout.pool_namespace"
4727 || name == "ceph.file.layout.pool_namespace") {
4728 // Namespace is the only layout field that has a meaningful
4729 // null/none value (an empty string, meaning the default layout).
4730 // Removing it is equivalent to a setxattr with an empty string, so
4731 // pass the empty payload of the rmxattr request through to do this.
4732 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4733 return;
4734 }
4735
4736 respond_to_request(mdr, -ENODATA);
4737 }
4738
4739 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4740 CInode *in;
4741 public:
4742
4743 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4744 ServerLogContext(s, r), in(i) { }
4745 void finish(int r) override {
4746 assert(r == 0);
4747
4748 // apply
4749 in->pop_and_dirty_projected_inode(mdr->ls);
4750
4751 mdr->apply();
4752
4753 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
4754
4755 server->respond_to_request(mdr, 0);
4756 }
4757 };
4758
4759 void Server::handle_client_setxattr(MDRequestRef& mdr)
4760 {
4761 MClientRequest *req = mdr->client_request;
4762 string name(req->get_path2());
4763 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4764 CInode *cur;
4765
4766 file_layout_t *dir_layout = NULL;
4767 if (name.compare(0, 15, "ceph.dir.layout") == 0)
4768 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4769 else
4770 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4771 if (!cur)
4772 return;
4773
4774 if (mdr->snapid != CEPH_NOSNAP) {
4775 respond_to_request(mdr, -EROFS);
4776 return;
4777 }
4778
4779 int flags = req->head.args.setxattr.flags;
4780
4781 // magic ceph.* namespace?
4782 if (name.compare(0, 5, "ceph.") == 0) {
4783 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4784 return;
4785 }
4786
4787 xlocks.insert(&cur->xattrlock);
4788 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4789 return;
4790
4791 if (!check_access(mdr, cur, MAY_WRITE))
4792 return;
4793
4794 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4795 size_t len = req->get_data().length();
4796 size_t inc = len + name.length();
4797
4798 // check xattrs kv pairs size
4799 size_t cur_xattrs_size = 0;
4800 for (const auto& p : *pxattrs) {
4801 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
4802 continue;
4803 }
4804 cur_xattrs_size += p.first.length() + p.second.length();
4805 }
4806
4807 if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
4808 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4809 << cur_xattrs_size << ", inc " << inc << dendl;
4810 respond_to_request(mdr, -ENOSPC);
4811 return;
4812 }
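  // (e.g. with the default mds_max_xattr_pairs_size of 64 KiB, an inode
  // already carrying 60 KiB of xattr key+value bytes can only accept about
  // 4 KiB more of name plus value, and XATTR_REPLACE of an existing key
  // doesn't double-count the old pair thanks to the skip above.)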
4813
4814 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
4815 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
4816 respond_to_request(mdr, -EEXIST);
4817 return;
4818 }
4819 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
4820 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
4821 respond_to_request(mdr, -ENODATA);
4822 return;
4823 }
4824
4825 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
4826
4827 // project update
4828 map<string,bufferptr> *px = new map<string,bufferptr>;
4829 inode_t *pi = cur->project_inode(px);
4830 pi->version = cur->pre_dirty();
4831 pi->ctime = mdr->get_op_stamp();
4832 pi->change_attr++;
4833 pi->xattr_version++;
4834 px->erase(name);
4835 if (!(flags & CEPH_XATTR_REMOVE)) {
4836 (*px)[name] = buffer::create(len);
4837 if (len)
4838 req->get_data().copy(0, len, (*px)[name].c_str());
4839 }
4840
4841 // log + wait
4842 mdr->ls = mdlog->get_current_segment();
4843 EUpdate *le = new EUpdate(mdlog, "setxattr");
4844 mdlog->start_entry(le);
4845 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4846 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4847 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4848
4849 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4850 }
4851
4852 void Server::handle_client_removexattr(MDRequestRef& mdr)
4853 {
4854 MClientRequest *req = mdr->client_request;
4855 string name(req->get_path2());
4856 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4857 file_layout_t *dir_layout = NULL;
4858 CInode *cur;
4859 if (name == "ceph.dir.layout")
4860 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4861 else
4862 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4863 if (!cur)
4864 return;
4865
4866 if (mdr->snapid != CEPH_NOSNAP) {
4867 respond_to_request(mdr, -EROFS);
4868 return;
4869 }
4870
4871 if (name.compare(0, 5, "ceph.") == 0) {
4872 handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4873 return;
4874 }
4875
4876 xlocks.insert(&cur->xattrlock);
4877 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4878 return;
4879
4880 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4881 if (pxattrs->count(name) == 0) {
4882 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
4883 respond_to_request(mdr, -ENODATA);
4884 return;
4885 }
4886
4887 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
4888
4889 // project update
4890 map<string,bufferptr> *px = new map<string,bufferptr>;
4891 inode_t *pi = cur->project_inode(px);
4892 pi->version = cur->pre_dirty();
4893 pi->ctime = mdr->get_op_stamp();
4894 pi->change_attr++;
4895 pi->xattr_version++;
4896 px->erase(name);
4897
4898 // log + wait
4899 mdr->ls = mdlog->get_current_segment();
4900 EUpdate *le = new EUpdate(mdlog, "removexattr");
4901 mdlog->start_entry(le);
4902 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4903 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4904 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4905
4906 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4907 }
4908
4909
4910 // =================================================================
4911 // DIRECTORY and NAMESPACE OPS
4912
4913
4914 // ------------------------------------------------
4915
4916 // MKNOD
4917
4918 class C_MDS_mknod_finish : public ServerLogContext {
4919 CDentry *dn;
4920 CInode *newi;
4921 public:
4922 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4923 ServerLogContext(s, r), dn(d), newi(ni) {}
4924 void finish(int r) override {
4925 assert(r == 0);
4926
4927 // link the inode
4928 dn->pop_projected_linkage();
4929
4930 // be a bit hacky with the inode version here: we decrement it
4931 // just to keep mark_dirty() happy. (we didn't bother projecting
4932 // a new version of the inode since it's just been created)
4933 newi->inode.version--;
4934 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
4935 newi->_mark_dirty_parent(mdr->ls, true);
4936
4937 // mkdir?
4938 if (newi->inode.is_dir()) {
4939 CDir *dir = newi->get_dirfrag(frag_t());
4940 assert(dir);
4941 dir->fnode.version--;
4942 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
4943 dir->mark_new(mdr->ls);
4944 }
4945
4946 mdr->apply();
4947
4948 MDRequestRef null_ref;
4949 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4950
4951 if (newi->inode.is_file())
4952 get_mds()->locker->share_inode_max_size(newi);
4953
4954 // hit pop
4955 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
4956
4957 // reply
4958 server->respond_to_request(mdr, 0);
4959 }
4960 };
4961
4962
4963 void Server::handle_client_mknod(MDRequestRef& mdr)
4964 {
4965 MClientRequest *req = mdr->client_request;
4966 client_t client = mdr->get_client();
4967 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4968 file_layout_t *dir_layout = NULL;
4969 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
4970 &dir_layout);
4971 if (!dn) return;
4972 if (mdr->snapid != CEPH_NOSNAP) {
4973 respond_to_request(mdr, -EROFS);
4974 return;
4975 }
4976 CInode *diri = dn->get_dir()->get_inode();
4977 rdlocks.insert(&diri->authlock);
4978 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4979 return;
4980
4981 if (!check_access(mdr, diri, MAY_WRITE))
4982 return;
4983
4984 if (!check_fragment_space(mdr, dn->get_dir()))
4985 return;
4986
4987 unsigned mode = req->head.args.mknod.mode;
4988 if ((mode & S_IFMT) == 0)
4989 mode |= S_IFREG;
4990
4991 // set layout
4992 file_layout_t layout;
4993 if (dir_layout && S_ISREG(mode))
4994 layout = *dir_layout;
4995 else
4996 layout = mdcache->default_file_layout;
4997
4998 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
4999 snapid_t follows = realm->get_newest_seq();
5000 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
5001 mode, &layout);
5002 assert(newi);
5003
5004 dn->push_projected_linkage(newi);
5005
5006 newi->inode.rdev = req->head.args.mknod.rdev;
5007 newi->inode.version = dn->pre_dirty();
5008 newi->inode.rstat.rfiles = 1;
5009 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5010 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5011 newi->inode.update_backtrace();
5012
5013 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5014 // want to write to it (e.g., if they are reexporting NFS)
5015 if (S_ISREG(newi->inode.mode)) {
5016 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5017 newi->inode.client_ranges[client].range.first = 0;
5018 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5019 newi->inode.client_ranges[client].follows = follows;
5020
5021 // issue a cap on the file
5022 int cmode = CEPH_FILE_MODE_RDWR;
5023 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5024 if (cap) {
5025 cap->set_wanted(0);
5026
5027 // put locks in excl mode
5028 newi->filelock.set_state(LOCK_EXCL);
5029 newi->authlock.set_state(LOCK_EXCL);
5030 newi->xattrlock.set_state(LOCK_EXCL);
5031 }
5032 }
5033
5034 assert(dn->first == follows + 1);
5035 newi->first = dn->first;
5036
5037 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5038
5039 // prepare finisher
5040 mdr->ls = mdlog->get_current_segment();
5041 EUpdate *le = new EUpdate(mdlog, "mknod");
5042 mdlog->start_entry(le);
5043 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5044 journal_allocated_inos(mdr, &le->metablob);
5045
5046 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5047 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5048 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5049
5050 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5051 }
5052
5053
5054
5055 // MKDIR
5056 /* This function takes responsibility for the passed mdr */
5057 void Server::handle_client_mkdir(MDRequestRef& mdr)
5058 {
5059 MClientRequest *req = mdr->client_request;
5060 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5061 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5062 if (!dn) return;
5063 if (mdr->snapid != CEPH_NOSNAP) {
5064 respond_to_request(mdr, -EROFS);
5065 return;
5066 }
5067 CDir *dir = dn->get_dir();
5068 CInode *diri = dir->get_inode();
5069 rdlocks.insert(&diri->authlock);
5070 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5071 return;
5072
5073 // mkdir check access
5074 if (!check_access(mdr, diri, MAY_WRITE))
5075 return;
5076
5077 if (!check_fragment_space(mdr, dir))
5078 return;
5079
5080 // new inode
5081 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5082 snapid_t follows = realm->get_newest_seq();
5083
5084 unsigned mode = req->head.args.mkdir.mode;
5085 mode &= ~S_IFMT;
5086 mode |= S_IFDIR;
5087 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5088 assert(newi);
5089
5090 // it's a directory.
5091 dn->push_projected_linkage(newi);
5092
5093 newi->inode.version = dn->pre_dirty();
5094 newi->inode.rstat.rsubdirs = 1;
5095 newi->inode.update_backtrace();
5096
5097 dout(12) << " follows " << follows << dendl;
5098 assert(dn->first == follows + 1);
5099 newi->first = dn->first;
5100
5101 // ...and that new dir is empty.
5102 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5103 newdir->state_set(CDir::STATE_CREATING);
5104 newdir->mark_complete();
5105 newdir->fnode.version = newdir->pre_dirty();
5106
5107 // prepare finisher
5108 mdr->ls = mdlog->get_current_segment();
5109 EUpdate *le = new EUpdate(mdlog, "mkdir");
5110 mdlog->start_entry(le);
5111 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5112 journal_allocated_inos(mdr, &le->metablob);
5113 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5114 le->metablob.add_primary_dentry(dn, newi, true, true);
5115 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5116
5117 // issue a cap on the directory
5118 int cmode = CEPH_FILE_MODE_RDWR;
5119 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5120 if (cap) {
5121 cap->set_wanted(0);
5122
5123 // put locks in excl mode
5124 newi->filelock.set_state(LOCK_EXCL);
5125 newi->authlock.set_state(LOCK_EXCL);
5126 newi->xattrlock.set_state(LOCK_EXCL);
5127 }
5128
5129 // make sure this inode gets into the journal
5130 le->metablob.add_opened_ino(newi->ino());
5131 LogSegment *ls = mds->mdlog->get_current_segment();
5132 ls->open_files.push_back(&newi->item_open_file);
5133
5134 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5135 }
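// For illustration only: the corresponding libcephfs call is
// ceph_mkdir(cmount, "/newdir", 0755) (mount setup as in the mknod sketch
// above). note that unlike mknod, mkdir also journals the new empty dirfrag
// via add_new_dir() and pins the inode into the open-files list.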
5136
5137
5138 // SYMLINK
5139
5140 void Server::handle_client_symlink(MDRequestRef& mdr)
5141 {
5142 MClientRequest *req = mdr->client_request;
5143 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5144 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5145 if (!dn) return;
5146 if (mdr->snapid != CEPH_NOSNAP) {
5147 respond_to_request(mdr, -EROFS);
5148 return;
5149 }
5150 CDir *dir = dn->get_dir();
5151 CInode *diri = dir->get_inode();
5152 rdlocks.insert(&diri->authlock);
5153 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5154 return;
5155
5156 if (!check_access(mdr, diri, MAY_WRITE))
5157 return;
5158
5159 if (!check_fragment_space(mdr, dir))
5160 return;
5161
5162 unsigned mode = S_IFLNK | 0777;
5163 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5164 assert(newi);
5165
5166 // it's a symlink
5167 dn->push_projected_linkage(newi);
5168
5169 newi->symlink = req->get_path2();
5170 newi->inode.size = newi->symlink.length();
5171 newi->inode.rstat.rbytes = newi->inode.size;
5172 newi->inode.rstat.rfiles = 1;
5173 newi->inode.version = dn->pre_dirty();
5174 newi->inode.update_backtrace();
5175
5176 newi->first = dn->first;
5177
5178 // prepare finisher
5179 mdr->ls = mdlog->get_current_segment();
5180 EUpdate *le = new EUpdate(mdlog, "symlink");
5181 mdlog->start_entry(le);
5182 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5183 journal_allocated_inos(mdr, &le->metablob);
5184 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5185 le->metablob.add_primary_dentry(dn, newi, true, true);
5186
5187 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5188 }
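// For illustration only: reached via e.g. ceph_symlink(cmount, "/target",
// "/linkname") (mount setup as in the mknod sketch above). the link target
// arrives as the request's second path (req->get_path2()) and is stored
// inline in the inode, so rstat.rbytes tracks the target string length.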
5189
5190
5191
5192
5193
5194 // LINK
5195
5196 void Server::handle_client_link(MDRequestRef& mdr)
5197 {
5198 MClientRequest *req = mdr->client_request;
5199
5200 dout(7) << "handle_client_link " << req->get_filepath()
5201 << " to " << req->get_filepath2()
5202 << dendl;
5203
5204 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5205
5206 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5207 if (!dn) return;
5208 CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
5209 if (!targeti) return;
5210 if (mdr->snapid != CEPH_NOSNAP) {
5211 respond_to_request(mdr, -EROFS);
5212 return;
5213 }
5214
5215 CDir *dir = dn->get_dir();
5216 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5217 dout(7) << "target is " << *targeti << dendl;
5218 if (targeti->is_dir()) {
5219 // if srcdn is replica, need to make sure its linkage is correct
5220 vector<CDentry*>& trace = mdr->dn[1];
5221 if (trace.empty() ||
5222 trace.back()->is_auth() ||
5223 trace.back()->lock.can_read(mdr->get_client())) {
5224 dout(7) << "target is a dir, failing..." << dendl;
5225 respond_to_request(mdr, -EINVAL);
5226 return;
5227 }
5228 }
5229
5230 xlocks.insert(&targeti->linklock);
5231
5232 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5233 return;
5234
5235 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5236 if (!check_access(mdr, targeti, MAY_WRITE))
5237 return;
5238
5239 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5240 return;
5241
5242 if (!check_fragment_space(mdr, dir))
5243 return;
5244 }
5245
5246 // go!
5247 assert(g_conf->mds_kill_link_at != 1);
5248
5249 // local or remote?
5250 if (targeti->is_auth())
5251 _link_local(mdr, dn, targeti);
5252 else
5253 _link_remote(mdr, true, dn, targeti);
5254 }
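// For illustration only: reached via e.g. ceph_link(cmount, "/existing",
// "/newname") (mount setup as in the mknod sketch above). whether
// _link_local or _link_remote runs depends solely on which rank is auth for
// the target inode, as checked at the end of handle_client_link above.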
5255
5256
5257 class C_MDS_link_local_finish : public ServerLogContext {
5258 CDentry *dn;
5259 CInode *targeti;
5260 version_t dnpv;
5261 version_t tipv;
5262 public:
5263 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5264 version_t dnpv_, version_t tipv_) :
5265 ServerLogContext(s, r), dn(d), targeti(ti),
5266 dnpv(dnpv_), tipv(tipv_) { }
5267 void finish(int r) override {
5268 assert(r == 0);
5269 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5270 }
5271 };
5272
5273
5274 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5275 {
5276 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
5277
5278 mdr->ls = mdlog->get_current_segment();
5279
5280 // predirty NEW dentry
5281 version_t dnpv = dn->pre_dirty();
5282 version_t tipv = targeti->pre_dirty();
5283
5284 // project inode update
5285 inode_t *pi = targeti->project_inode();
5286 pi->nlink++;
5287 pi->ctime = mdr->get_op_stamp();
5288 pi->change_attr++;
5289 pi->version = tipv;
5290
5291 // log + wait
5292 EUpdate *le = new EUpdate(mdlog, "link_local");
5293 mdlog->start_entry(le);
5294 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5295 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
5296 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
5297 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5298 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
5299
5300 // do this after predirty_*, to avoid funky extra dnl arg
5301 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5302
5303 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
5304 }
5305
5306 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
5307 version_t dnpv, version_t tipv)
5308 {
5309 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
5310
5311 // link and unlock the NEW dentry
5312 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5313 if (!dnl->get_inode())
5314 dn->link_remote(dnl, targeti);
5315 dn->mark_dirty(dnpv, mdr->ls);
5316
5317 // target inode
5318 targeti->pop_and_dirty_projected_inode(mdr->ls);
5319
5320 mdr->apply();
5321
5322 MDRequestRef null_ref;
5323 mdcache->send_dentry_link(dn, null_ref);
5324
5325 // bump target popularity
5326 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5327 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5328
5329 // reply
5330 respond_to_request(mdr, 0);
5331 }
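// the local-link path above follows the MDS's project/journal/apply pattern:
//   1. project: project_inode() + pre_dirty() stage new versions in memory;
//   2. journal: the EUpdate metablob records the projected state;
//   3. apply:   once the log entry is durable, this finisher pops the
//               projected linkage/inode and marks them dirty in the cache.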
5332
5333
5334 // link / unlink remote
5335
5336 class C_MDS_link_remote_finish : public ServerLogContext {
5337 bool inc;
5338 CDentry *dn;
5339 CInode *targeti;
5340 version_t dpv;
5341 public:
5342 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5343 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5344 dpv(d->get_projected_version()) {}
5345 void finish(int r) override {
5346 assert(r == 0);
5347 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5348 }
5349 };
5350
5351 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
5352 {
5353 dout(10) << "_link_remote "
5354 << (inc ? "link ":"unlink ")
5355 << *dn << " to " << *targeti << dendl;
5356
5357 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5358 mds_rank_t linkauth = targeti->authority().first;
5359 if (mdr->more()->witnessed.count(linkauth) == 0) {
5360 if (mds->is_cluster_degraded() &&
5361 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
5362 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
5363 if (mdr->more()->waiting_on_slave.empty())
5364 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
5365 return;
5366 }
5367
5368 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
5369 int op;
5370 if (inc)
5371 op = MMDSSlaveRequest::OP_LINKPREP;
5372 else
5373 op = MMDSSlaveRequest::OP_UNLINKPREP;
5374 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
5375 targeti->set_object_info(req->get_object_info());
5376 req->op_stamp = mdr->get_op_stamp();
5377 mds->send_message_mds(req, linkauth);
5378
5379 assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
5380 mdr->more()->waiting_on_slave.insert(linkauth);
5381 return;
5382 }
5383 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
5384
5385 assert(g_conf->mds_kill_link_at != 2);
5386
5387 mdr->set_mds_stamp(ceph_clock_now());
5388
5389 // add to event
5390 mdr->ls = mdlog->get_current_segment();
5391 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
5392 mdlog->start_entry(le);
5393 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5394 if (!mdr->more()->witnessed.empty()) {
5395 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5396 le->reqid = mdr->reqid;
5397 le->had_slaves = true;
5398 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5399 }
5400
5401 if (inc) {
5402 dn->pre_dirty();
5403 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
5404 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5405 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5406 } else {
5407 dn->pre_dirty();
5408 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
5409 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5410 le->metablob.add_null_dentry(dn, true);
5411 dn->push_projected_linkage();
5412 }
5413
5414 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
5415 }
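// in outline, remote link/unlink is a two-phase update between the master
// (this rank) and the target inode's auth (the "slave"):
//   master: send OP_LINKPREP/OP_UNLINKPREP, wait        (_link_remote, above)
//   slave:  journal an ESlaveUpdate PREPARE + rollback blob, then ack
//           (handle_slave_link_prep, below)
//   master: journal the EUpdate with had_slaves set, apply, reply
//   slave:  on master commit, journal COMMIT and send OP_COMMITTED;
//           on failure, replay the rollback blob (do_link_rollback, below)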
5416
5417 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
5418 CDentry *dn, CInode *targeti,
5419 version_t dpv)
5420 {
5421 dout(10) << "_link_remote_finish "
5422 << (inc ? "link ":"unlink ")
5423 << *dn << " to " << *targeti << dendl;
5424
5425 assert(g_conf->mds_kill_link_at != 3);
5426
5427 if (!mdr->more()->witnessed.empty())
5428 mdcache->logged_master_update(mdr->reqid);
5429
5430 if (inc) {
5431 // link the new dentry
5432 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5433 if (!dnl->get_inode())
5434 dn->link_remote(dnl, targeti);
5435 dn->mark_dirty(dpv, mdr->ls);
5436 } else {
5437 // unlink main dentry
5438 dn->get_dir()->unlink_inode(dn);
5439 dn->pop_projected_linkage();
5440 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
5441 }
5442
5443 mdr->apply();
5444
5445 MDRequestRef null_ref;
5446 if (inc)
5447 mdcache->send_dentry_link(dn, null_ref);
5448 else
5449 mdcache->send_dentry_unlink(dn, NULL, null_ref);
5450
5451 // bump target popularity
5452 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5453 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5454
5455 // reply
5456 respond_to_request(mdr, 0);
5457
5458 if (!inc)
5459 // removing a new dn?
5460 dn->get_dir()->try_remove_unlinked_dn(dn);
5461 }
5462
5463
5464 // remote linking/unlinking
5465
5466 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5467 CInode *targeti;
5468 public:
5469 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5470 ServerLogContext(s, r), targeti(t) { }
5471 void finish(int r) override {
5472 assert(r == 0);
5473 server->_logged_slave_link(mdr, targeti);
5474 }
5475 };
5476
5477 class C_MDS_SlaveLinkCommit : public ServerContext {
5478 MDRequestRef mdr;
5479 CInode *targeti;
5480 public:
5481 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5482 ServerContext(s), mdr(r), targeti(t) { }
5483 void finish(int r) override {
5484 server->_commit_slave_link(mdr, r, targeti);
5485 }
5486 };
5487
5488 /* This function DOES put the mdr->slave_request before returning*/
5489 void Server::handle_slave_link_prep(MDRequestRef& mdr)
5490 {
5491 dout(10) << "handle_slave_link_prep " << *mdr
5492 << " on " << mdr->slave_request->get_object_info()
5493 << dendl;
5494
5495 assert(g_conf->mds_kill_link_at != 4);
5496
5497 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
5498 assert(targeti);
5499 dout(10) << "targeti " << *targeti << dendl;
5500 CDentry *dn = targeti->get_parent_dn();
5501 CDentry::linkage_t *dnl = dn->get_linkage();
5502 assert(dnl->is_primary());
5503
5504 mdr->set_op_stamp(mdr->slave_request->op_stamp);
5505
5506 mdr->auth_pin(targeti);
5507
5508 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5509 assert(g_conf->mds_kill_link_at != 5);
5510
5511 // journal it
5512 mdr->ls = mdlog->get_current_segment();
5513 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
5514 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
5515 mdlog->start_entry(le);
5516
5517 inode_t *pi = dnl->get_inode()->project_inode();
5518
5519 // update journaled target inode
5520 bool inc;
5521 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
5522 inc = true;
5523 pi->nlink++;
5524 } else {
5525 inc = false;
5526 pi->nlink--;
5527 }
5528
5529 link_rollback rollback;
5530 rollback.reqid = mdr->reqid;
5531 rollback.ino = targeti->ino();
5532 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
5533 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
5534 rollback.old_dir_mtime = pf->fragstat.mtime;
5535 rollback.old_dir_rctime = pf->rstat.rctime;
5536 rollback.was_inc = inc;
5537 ::encode(rollback, le->rollback);
5538 mdr->more()->rollback_bl = le->rollback;
5539
5540 pi->ctime = mdr->get_op_stamp();
5541 pi->version = targeti->pre_dirty();
5542
5543 dout(10) << " projected inode " << pi << " v " << pi->version << dendl;
5544
5545 // commit case
5546 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
5547 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
5548
5549 // set up commit waiter
5550 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
5551
5552 mdr->more()->slave_update_journaled = true;
5553 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
5554 mdr, __func__);
5555 mdlog->flush();
5556 }
5557
5558 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
5559 {
5560 dout(10) << "_logged_slave_link " << *mdr
5561 << " " << *targeti << dendl;
5562
5563 assert(g_conf->mds_kill_link_at != 6);
5564
5565 // update the target
5566 targeti->pop_and_dirty_projected_inode(mdr->ls);
5567 mdr->apply();
5568
5569 // hit pop
5570 mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
5571
5572 // done.
5573 mdr->slave_request->put();
5574 mdr->slave_request = 0;
5575
5576 // ack
5577 if (!mdr->aborted) {
5578 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5579 MMDSSlaveRequest::OP_LINKPREPACK);
5580 mds->send_message_mds(reply, mdr->slave_to_mds);
5581 } else {
5582 dout(10) << " abort flag set, finishing" << dendl;
5583 mdcache->request_finish(mdr);
5584 }
5585 }
5586
5587
5588 struct C_MDS_CommittedSlave : public ServerLogContext {
5589 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
5590 void finish(int r) override {
5591 server->_committed_slave(mdr);
5592 }
5593 };
5594
5595 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
5596 {
5597 dout(10) << "_commit_slave_link " << *mdr
5598 << " r=" << r
5599 << " " << *targeti << dendl;
5600
5601 assert(g_conf->mds_kill_link_at != 7);
5602
5603 if (r == 0) {
5604 // drop our pins, etc.
5605 mdr->cleanup();
5606
5607 // write a commit to the journal
5608 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
5609 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
5610 mdlog->start_entry(le);
5611 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
5612 mdlog->flush();
5613 } else {
5614 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
5615 }
5616 }
5617
5618 void Server::_committed_slave(MDRequestRef& mdr)
5619 {
5620 dout(10) << "_committed_slave " << *mdr << dendl;
5621
5622 assert(g_conf->mds_kill_link_at != 8);
5623
5624 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5625 MMDSSlaveRequest::OP_COMMITTED);
5626 mds->send_message_mds(req, mdr->slave_to_mds);
5627 mdcache->request_finish(mdr);
5628 }
5629
5630 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5631 MutationRef mut;
5632 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5633 void finish(int r) override {
5634 server->_link_rollback_finish(mut, mdr);
5635 }
5636 };
5637
5638 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
5639 {
5640 link_rollback rollback;
5641 bufferlist::iterator p = rbl.begin();
5642 ::decode(rollback, p);
5643
5644 dout(10) << "do_link_rollback on " << rollback.reqid
5645 << (rollback.was_inc ? " inc":" dec")
5646 << " ino " << rollback.ino
5647 << dendl;
5648
5649 assert(g_conf->mds_kill_link_at != 9);
5650
5651 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
5652 assert(mdr || mds->is_resolve());
5653
5654 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
5655 mut->ls = mds->mdlog->get_current_segment();
5656
5657 CInode *in = mdcache->get_inode(rollback.ino);
5658 assert(in);
5659 dout(10) << " target is " << *in << dendl;
5660 assert(!in->is_projected()); // live slave request holds versionlock xlock.
5661
5662 inode_t *pi = in->project_inode();
5663 pi->version = in->pre_dirty();
5664 mut->add_projected_inode(in);
5665
5666 // parent dir rctime
5667 CDir *parent = in->get_projected_parent_dn()->get_dir();
5668 fnode_t *pf = parent->project_fnode();
5669 mut->add_projected_fnode(parent);
5670 pf->version = parent->pre_dirty();
5671 if (pf->fragstat.mtime == pi->ctime) {
5672 pf->fragstat.mtime = rollback.old_dir_mtime;
5673 if (pf->rstat.rctime == pi->ctime)
5674 pf->rstat.rctime = rollback.old_dir_rctime;
5675 mut->add_updated_lock(&parent->get_inode()->filelock);
5676 mut->add_updated_lock(&parent->get_inode()->nestlock);
5677 }
5678
5679 // inode
5680 pi->ctime = rollback.old_ctime;
5681 if (rollback.was_inc)
5682 pi->nlink--;
5683 else
5684 pi->nlink++;
5685
5686 // journal it
5687 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
5688 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
5689 mdlog->start_entry(le);
5690 le->commit.add_dir_context(parent);
5691 le->commit.add_dir(parent, true);
5692 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
5693
5694 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
5695 mdr, __func__);
5696 mdlog->flush();
5697 }
5698
5699 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
5700 {
5701 dout(10) << "_link_rollback_finish" << dendl;
5702
5703 assert(g_conf->mds_kill_link_at != 10);
5704
5705 mut->apply();
5706 if (mdr)
5707 mdcache->request_finish(mdr);
5708
5709 mdcache->finish_rollback(mut->reqid);
5710
5711 mut->cleanup();
5712 }
5713
5714
5715 /* This function DOES NOT put the passed message before returning*/
5716 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
5717 {
5718 dout(10) << "handle_slave_link_prep_ack " << *mdr
5719 << " " << *m << dendl;
5720 mds_rank_t from = mds_rank_t(m->get_source().num());
5721
5722 assert(g_conf->mds_kill_link_at != 11);
5723
5724 // note slave
5725 mdr->more()->slaves.insert(from);
5726
5727 // witnessed!
5728 assert(mdr->more()->witnessed.count(from) == 0);
5729 mdr->more()->witnessed.insert(from);
5730 assert(!m->is_not_journaled());
5731 mdr->more()->has_journaled_slaves = true;
5732
5733 // remove from waiting list
5734 assert(mdr->more()->waiting_on_slave.count(from));
5735 mdr->more()->waiting_on_slave.erase(from);
5736
5737 assert(mdr->more()->waiting_on_slave.empty());
5738
5739 dispatch_client_request(mdr); // go again!
5740 }
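// note: the assert(g_conf->mds_kill_link_at != N) checks threaded through
// the link path (N = 1..11 above) are failure-injection points: setting
// mds_kill_link_at to N makes the MDS assert out at that stage, so recovery
// and rollback of partially-committed links can be exercised in testing.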
5741
5742
5743
5744
5745
5746 // UNLINK
5747
5748 void Server::handle_client_unlink(MDRequestRef& mdr)
5749 {
5750 MClientRequest *req = mdr->client_request;
5751 client_t client = mdr->get_client();
5752
5753 // rmdir or unlink?
5754 bool rmdir = false;
5755 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
5756
5757 if (req->get_filepath().depth() == 0) {
5758 respond_to_request(mdr, -EINVAL);
5759 return;
5760 }
5761
5762 // traverse to path
5763 vector<CDentry*> trace;
5764 CInode *in;
5765 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
5766 if (r > 0) return;
5767 if (r < 0) {
5768 if (r == -ESTALE) {
5769 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
5770 mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
5771 return;
5772 }
5773 respond_to_request(mdr, r);
5774 return;
5775 }
5776 if (mdr->snapid != CEPH_NOSNAP) {
5777 respond_to_request(mdr, -EROFS);
5778 return;
5779 }
5780
5781 CDentry *dn = trace[trace.size()-1];
5782 assert(dn);
5783 if (!dn->is_auth()) {
5784 mdcache->request_forward(mdr, dn->authority().first);
5785 return;
5786 }
5787
5788 CInode *diri = dn->get_dir()->get_inode();
5789
5790 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
5791 assert(!dnl->is_null());
5792
5793 if (rmdir) {
5794 dout(7) << "handle_client_rmdir on " << *dn << dendl;
5795 } else {
5796 dout(7) << "handle_client_unlink on " << *dn << dendl;
5797 }
5798 dout(7) << "dn links to " << *in << dendl;
5799
5800 // rmdir vs is_dir
5801 if (in->is_dir()) {
5802 if (rmdir) {
5803 // do empty directory checks
5804 if (_dir_is_nonempty_unlocked(mdr, in)) {
5805 respond_to_request(mdr, -ENOTEMPTY);
5806 return;
5807 }
5808 } else {
5809 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
5810 respond_to_request(mdr, -EISDIR);
5811 return;
5812 }
5813 } else {
5814 if (rmdir) {
5815 // unlink
5816 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
5817 respond_to_request(mdr, -ENOTDIR);
5818 return;
5819 }
5820 }
5821
5822 // -- create stray dentry? --
5823 CDentry *straydn = NULL;
5824 if (dnl->is_primary()) {
5825 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
5826 if (!straydn)
5827 return;
5828 dout(10) << " straydn is " << *straydn << dendl;
5829 } else if (mdr->straydn) {
5830 mdr->unpin(mdr->straydn);
5831 mdr->straydn = NULL;
5832 }
5833
5834 // lock
5835 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5836
5837 for (int i=0; i<(int)trace.size()-1; i++)
5838 rdlocks.insert(&trace[i]->lock);
5839 xlocks.insert(&dn->lock);
5840 wrlocks.insert(&diri->filelock);
5841 wrlocks.insert(&diri->nestlock);
5842 xlocks.insert(&in->linklock);
5843 if (straydn) {
5844 wrlocks.insert(&straydn->get_dir()->inode->filelock);
5845 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
5846 xlocks.insert(&straydn->lock);
5847 }
5848 if (in->is_dir())
5849 rdlocks.insert(&in->filelock); // to verify it's empty
5850 mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
5851
5852 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5853 return;
5854
5855 if (in->is_dir() &&
5856 _dir_is_nonempty(mdr, in)) {
5857 respond_to_request(mdr, -ENOTEMPTY);
5858 return;
5859 }
5860
5861 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5862 if (!check_access(mdr, diri, MAY_WRITE))
5863 return;
5864 }
5865
5866 // yay!
5867 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
5868 // subtree root auths need to be witnesses
5869 set<mds_rank_t> witnesses;
5870 in->list_replicas(witnesses);
5871 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
5872
5873 for (set<mds_rank_t>::iterator p = witnesses.begin();
5874 p != witnesses.end();
5875 ++p) {
5876 if (mdr->more()->witnessed.count(*p)) {
5877 dout(10) << " already witnessed by mds." << *p << dendl;
5878 } else if (mdr->more()->waiting_on_slave.count(*p)) {
5879 dout(10) << " already waiting on witness mds." << *p << dendl;
5880 } else {
5881 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
5882 return;
5883 }
5884 }
5885 if (!mdr->more()->waiting_on_slave.empty())
5886 return; // we're waiting for a witness.
5887 }
5888
5889 // ok!
5890 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
5891 _link_remote(mdr, false, dn, dnl->get_inode());
5892 else
5893 _unlink_local(mdr, dn, straydn);
5894 }
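// For illustration only: reached via e.g. ceph_unlink(cmount, "/file") or
// ceph_rmdir(cmount, "/dir") (mount setup as in the mknod sketch above);
// both map to this one handler, distinguished by CEPH_MDS_OP_RMDIR.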
5895
5896 class C_MDS_unlink_local_finish : public ServerLogContext {
5897 CDentry *dn;
5898 CDentry *straydn;
5899 version_t dnpv; // deleted dentry
5900 public:
5901 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
5902 ServerLogContext(s, r), dn(d), straydn(sd),
5903 dnpv(d->get_projected_version()) {}
5904 void finish(int r) override {
5905 assert(r == 0);
5906 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
5907 }
5908 };
5909
5910 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
5911 {
5912 dout(10) << "_unlink_local " << *dn << dendl;
5913
5914 CDentry::linkage_t *dnl = dn->get_projected_linkage();
5915 CInode *in = dnl->get_inode();
5916
5917 SnapRealm *realm = in->find_snaprealm();
5918 snapid_t follows = realm->get_newest_seq();
5919
5920 // ok, let's do it.
5921 mdr->ls = mdlog->get_current_segment();
5922
5923 // prepare log entry
5924 EUpdate *le = new EUpdate(mdlog, "unlink_local");
5925 mdlog->start_entry(le);
5926 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5927 if (!mdr->more()->witnessed.empty()) {
5928 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5929 le->reqid = mdr->reqid;
5930 le->had_slaves = true;
5931 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5932 }
5933
5934 if (straydn) {
5935 assert(dnl->is_primary());
5936 straydn->push_projected_linkage(in);
5937 straydn->first = follows + 1;
5938 }
5939
5940 // the unlinked dentry
5941 dn->pre_dirty();
5942
5943 inode_t *pi = in->project_inode();
5944 dn->make_path_string(pi->stray_prior_path, true);
5945 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
5946 pi->version = in->pre_dirty();
5947 pi->ctime = mdr->get_op_stamp();
5948 pi->change_attr++;
5949 pi->nlink--;
5950 if (pi->nlink == 0)
5951 in->state_set(CInode::STATE_ORPHAN);
5952
5953 if (dnl->is_primary()) {
5954 // primary link. add stray dentry.
5955 assert(straydn);
5956 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
5957 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5958
5959 // project snaprealm, too
5960 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
5961 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
5962
5963 pi->update_backtrace();
5964 le->metablob.add_primary_dentry(straydn, in, true, true);
5965 } else {
5966 // remote link. update remote inode.
5967 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
5968 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5969 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5970 }
5971
5972 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5973 le->metablob.add_null_dentry(dn, true);
5974
5975 if (in->is_dir()) {
5976 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
5977 le->metablob.renamed_dirino = in->ino();
5978 }
5979
5980 dn->push_projected_linkage();
5981
5982 if (in->is_dir()) {
5983 assert(straydn);
5984 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
5985 }
5986
5987 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
5988 }
5989
5990 void Server::_unlink_local_finish(MDRequestRef& mdr,
5991 CDentry *dn, CDentry *straydn,
5992 version_t dnpv)
5993 {
5994 dout(10) << "_unlink_local_finish " << *dn << dendl;
5995
5996 if (!mdr->more()->witnessed.empty())
5997 mdcache->logged_master_update(mdr->reqid);
5998
5999 // unlink main dentry
6000 dn->get_dir()->unlink_inode(dn);
6001 dn->pop_projected_linkage();
6002
6003 // relink as stray? (i.e. was primary link?)
6004 CInode *strayin = NULL;
6005 bool snap_is_new = false;
6006 if (straydn) {
6007 dout(20) << " straydn is " << *straydn << dendl;
6008 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
6009 strayin = straydnl->get_inode();
6010
6011 snap_is_new = strayin->snaprealm ? true : false;
6012 mdcache->touch_dentry_bottom(straydn);
6013 }
6014
6015 dn->mark_dirty(dnpv, mdr->ls);
6016 mdr->apply();
6017
6018 if (snap_is_new) //only new if strayin exists
6019 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
6020
6021 mdcache->send_dentry_unlink(dn, straydn, mdr);
6022
6023 // update subtree map?
6024 if (straydn && strayin->is_dir())
6025 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6026
6027 // bump pop
6028 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
6029
6030 // reply
6031 respond_to_request(mdr, 0);
6032
6033 // removing a new dn?
6034 dn->get_dir()->try_remove_unlinked_dn(dn);
6035
6036 // clean up ?
6037 // respond_to_request() drops locks. So stray reintegration can race with us.
6038 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6039 // Tip off the MDCache that this dentry is a stray that
6040 // might be elegible for purge.
6041 mdcache->notify_stray(straydn);
6042 }
6043 }
6044
6045 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6046 {
6047 if (mds->is_cluster_degraded() &&
6048 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6049 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6050 if (mdr->more()->waiting_on_slave.empty())
6051 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6052 return false;
6053 }
6054
6055 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6056 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6057 MMDSSlaveRequest::OP_RMDIRPREP);
6058 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6059 for (auto dn : trace)
6060 req->srcdnpath.push_dentry(dn->name);
6061 mdcache->replicate_stray(straydn, who, req->stray);
6062
6063 req->op_stamp = mdr->get_op_stamp();
6064 mds->send_message_mds(req, who);
6065
6066 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6067 mdr->more()->waiting_on_slave.insert(who);
6068 return true;
6069 }
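// a witness ack flows back through handle_slave_rmdir_prep_ack below; the
// request is re-dispatched only once waiting_on_slave drains, mirroring the
// link-prep handshake above.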
6070
6071 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6072 CDentry *dn, *straydn;
6073 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6074 : ServerLogContext(s, r), dn(d), straydn(st) {}
6075 void finish(int r) override {
6076 server->_logged_slave_rmdir(mdr, dn, straydn);
6077 }
6078 };
6079
6080 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6081 MDRequestRef mdr;
6082 CDentry *straydn;
6083 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6084 : ServerContext(s), mdr(r), straydn(sd) { }
6085 void finish(int r) override {
6086 server->_commit_slave_rmdir(mdr, r, straydn);
6087 }
6088 };
6089
6090 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6091 {
6092 dout(10) << "handle_slave_rmdir_prep " << *mdr
6093 << " " << mdr->slave_request->srcdnpath
6094 << " to " << mdr->slave_request->destdnpath
6095 << dendl;
6096
6097 vector<CDentry*> trace;
6098 filepath srcpath(mdr->slave_request->srcdnpath);
6099 dout(10) << " src " << srcpath << dendl;
6100 CInode *in;
6101 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6102 if (r > 0) return;
6103 if (r == -ESTALE) {
6104 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6105 mdr->slave_to_mds);
6106 return;
6107 }
6108 assert(r == 0);
6109 CDentry *dn = trace[trace.size()-1];
6110 dout(10) << " dn " << *dn << dendl;
6111 mdr->pin(dn);
6112
6113 assert(mdr->straydn);
6114 CDentry *straydn = mdr->straydn;
6115 dout(10) << " straydn " << *straydn << dendl;
6116
6117 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6118
6119 rmdir_rollback rollback;
6120 rollback.reqid = mdr->reqid;
6121 rollback.src_dir = dn->get_dir()->dirfrag();
6122 rollback.src_dname = dn->name;
6123 rollback.dest_dir = straydn->get_dir()->dirfrag();
6124 rollback.dest_dname = straydn->name;
6125 ::encode(rollback, mdr->more()->rollback_bl);
6126 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6127
6128 // set up commit waiter
6129 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
6130
6131 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6132 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6133 dn->get_dir()->unlink_inode(dn);
6134 straydn->get_dir()->link_primary_inode(straydn, in);
6135
6136 assert(straydn->first >= in->first);
6137 in->first = straydn->first;
6138
6139 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
6140
6141 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6142 MMDSSlaveRequest::OP_RMDIRPREPACK);
6143 reply->mark_not_journaled();
6144 mds->send_message_mds(reply, mdr->slave_to_mds);
6145
6146 // send caps to auth (if we're not already)
6147 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
6148 mdcache->migrator->export_caps(in);
6149
6150 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
6151
6152 mdr->slave_request->put();
6153 mdr->slave_request = 0;
6154 mdr->straydn = 0;
6155 return;
6156 }
6157
6158 straydn->push_projected_linkage(in);
6159 dn->push_projected_linkage();
6160
6161 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6162 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
6163 mdlog->start_entry(le);
6164 le->rollback = mdr->more()->rollback_bl;
6165
6166 le->commit.add_dir_context(straydn->get_dir());
6167 le->commit.add_primary_dentry(straydn, in, true);
6168 // slave: no need to journal original dentry
6169
6170 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6171 le->commit.renamed_dirino = in->ino();
6172
6173 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6174
6175 mdr->more()->slave_update_journaled = true;
6176 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
6177 mdr, __func__);
6178 mdlog->flush();
6179 }
6180
6181 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6182 {
6183 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
6184
6185 // update our cache now, so we are consistent with what is in the journal
6186 // when we journal a subtree map
6187 CInode *in = dn->get_linkage()->get_inode();
6188 dn->get_dir()->unlink_inode(dn);
6189 straydn->pop_projected_linkage();
6190 dn->pop_projected_linkage();
6191 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
6192
6193 // done.
6194 mdr->slave_request->put();
6195 mdr->slave_request = 0;
6196 mdr->straydn = 0;
6197
6198 if (!mdr->aborted) {
6199 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6200 MMDSSlaveRequest::OP_RMDIRPREPACK);
6201 mds->send_message_mds(reply, mdr->slave_to_mds);
6202 } else {
6203 dout(10) << " abort flag set, finishing" << dendl;
6204 mdcache->request_finish(mdr);
6205 }
6206 }
6207
6208 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
6209 {
6210 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6211 << " " << *ack << dendl;
6212
6213 mds_rank_t from = mds_rank_t(ack->get_source().num());
6214
6215 mdr->more()->slaves.insert(from);
6216 mdr->more()->witnessed.insert(from);
6217 if (!ack->is_not_journaled())
6218 mdr->more()->has_journaled_slaves = true;
6219
6220 // remove from waiting list
6221 assert(mdr->more()->waiting_on_slave.count(from));
6222 mdr->more()->waiting_on_slave.erase(from);
6223
6224 if (mdr->more()->waiting_on_slave.empty())
6225 dispatch_client_request(mdr); // go again!
6226 else
6227 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
6228 }
6229
6230 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
6231 {
6232 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
6233
6234 if (r == 0) {
6235 if (mdr->more()->slave_update_journaled) {
6236 CInode *strayin = straydn->get_projected_linkage()->get_inode();
6237 if (strayin && !strayin->snaprealm)
6238 mdcache->clear_dirty_bits_for_stray(strayin);
6239 }
6240
6241 mdr->cleanup();
6242
6243 if (mdr->more()->slave_update_journaled) {
6244 // write a commit to the journal
6245 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
6246 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
6247 ESlaveUpdate::RMDIR);
6248 mdlog->start_entry(le);
6249 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6250 mdlog->flush();
6251 } else {
6252 _committed_slave(mdr);
6253 }
6254 } else {
6255 // abort
6256 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6257 }
6258 }
6259
6260 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6261 metareqid_t reqid;
6262 CDentry *dn;
6263 CDentry *straydn;
6264 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6265 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6266 void finish(int r) override {
6267 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6268 }
6269 };
6270
6271 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6272 {
6273 // unlike the other rollback methods, the rmdir rollback is only
6274 // needed to record the subtree changes in the journal for inode
6275 // replicas who are auth for empty dirfrags. no actual changes to
6276 // the file system are taking place here, so there is no Mutation.
6277
6278 rmdir_rollback rollback;
6279 bufferlist::iterator p = rbl.begin();
6280 ::decode(rollback, p);
6281
6282 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6283 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6284 assert(mdr || mds->is_resolve());
6285
6286 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6287 if (!dir)
6288 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6289 assert(dir);
6290 CDentry *dn = dir->lookup(rollback.src_dname);
6291 assert(dn);
6292 dout(10) << " dn " << *dn << dendl;
6293 dir = mdcache->get_dirfrag(rollback.dest_dir);
6294 assert(dir);
6295 CDentry *straydn = dir->lookup(rollback.dest_dname);
6296 assert(straydn);
6297 dout(10) << " straydn " << *dn << dendl;
6298 CInode *in = straydn->get_linkage()->get_inode();
6299
6300 if (mdr && !mdr->more()->slave_update_journaled) {
6301 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6302
6303 straydn->get_dir()->unlink_inode(straydn);
6304 dn->get_dir()->link_primary_inode(dn, in);
6305
6306 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6307
6308 mdcache->request_finish(mdr);
6309 mdcache->finish_rollback(rollback.reqid);
6310 return;
6311 }
6312
6313 dn->push_projected_linkage(in);
6314 straydn->push_projected_linkage();
6315
6316 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6317 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6318 mdlog->start_entry(le);
6319
6320 le->commit.add_dir_context(dn->get_dir());
6321 le->commit.add_primary_dentry(dn, in, true);
6322 // slave: no need to journal straydn
6323
6324 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6325 le->commit.renamed_dirino = in->ino();
6326
6327 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6328
6329 submit_mdlog_entry(le,
6330 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6331 dn, straydn),
6332 mdr, __func__);
6333 mdlog->flush();
6334 }
6335
6336 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
6337 {
6338 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
6339
6340 straydn->get_dir()->unlink_inode(straydn);
6341 dn->pop_projected_linkage();
6342 straydn->pop_projected_linkage();
6343
6344 CInode *in = dn->get_linkage()->get_inode();
6345 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
6346 if (mds->is_resolve()) {
6347 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
6348 mdcache->try_trim_non_auth_subtree(root);
6349 }
6350
6351 if (mdr)
6352 mdcache->request_finish(mdr);
6353
6354 mdcache->finish_rollback(reqid);
6355 }
6356
6357
6358 /** _dir_is_nonempty[_unlocked]
6359 *
6360 * check if a directory is non-empty (i.e. we can rmdir it).
6361 *
6362 * the unlocked variant is a fastpath check; we can't really be
6363 * sure until we rdlock the filelock.
6364 */
6365 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6366 {
6367 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6368 assert(in->is_auth());
6369
6370 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6371 return true; // in a snapshot!
6372
6373 list<CDir*> ls;
6374 in->get_dirfrags(ls);
6375 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6376 CDir *dir = *p;
6377 // is the frag obviously non-empty?
6378 if (dir->is_auth()) {
6379 if (dir->get_projected_fnode()->fragstat.size()) {
6380 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6381 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6382 return true;
6383 }
6384 }
6385 }
6386
6387 return false;
6388 }
6389
6390 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6391 {
6392 dout(10) << "dir_is_nonempty " << *in << dendl;
6393 assert(in->is_auth());
6394 assert(in->filelock.can_read(mdr->get_client()));
6395
6396 frag_info_t dirstat;
6397 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6398
6399 list<CDir*> ls;
6400 in->get_dirfrags(ls);
6401 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6402 CDir *dir = *p;
6403 const fnode_t *pf = dir->get_projected_fnode();
6404 if (pf->fragstat.size()) {
6405 dout(10) << "dir_is_nonempty dirstat has "
6406 << pf->fragstat.size() << " items " << *dir << dendl;
6407 return true;
6408 }
6409
6410 if (pf->accounted_fragstat.version == dirstat_version)
6411 dirstat.add(pf->accounted_fragstat);
6412 else
6413 dirstat.add(pf->fragstat);
6414 }
6415
6416 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6417 }
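// why the version/sum comparison above: each dirfrag carries its own
// fragstat, and the inode's dirstat is only the accounted aggregate. any
// frag with a nonzero fragstat is obviously non-empty; otherwise, if the
// per-frag sums no longer reconcile with the inode's dirstat size, an
// update is still in flight and we conservatively report non-empty.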
6418
6419
6420 // ======================================================
6421
6422
6423 class C_MDS_rename_finish : public ServerLogContext {
6424 CDentry *srcdn;
6425 CDentry *destdn;
6426 CDentry *straydn;
6427 public:
6428 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6429 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6430 ServerLogContext(s, r),
6431 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6432 void finish(int r) override {
6433 assert(r == 0);
6434 server->_rename_finish(mdr, srcdn, destdn, straydn);
6435 }
6436 };
6437
6438
6439 /** handle_client_rename
6440 *
6441 * rename master is the destdn auth. this is because cached inodes
6442 * must remain connected. thus, any replica of srci, must also
6443 * replicate destdn, and possibly straydn, so that srci (and
6444 * destdn->inode) remain connected during the rename.
6445 *
6446 * to do this, we freeze srci, then master (destdn auth) verifies that
6447 * all other nodes have also replicated destdn and straydn. note that
6448 * destdn replicas need not also replicate srci. this only works when
6449 * destdn is master.
6450 *
6451 * This function takes responsibility for the passed mdr.
6452 */
6453 void Server::handle_client_rename(MDRequestRef& mdr)
6454 {
6455 MClientRequest *req = mdr->client_request;
6456 dout(7) << "handle_client_rename " << *req << dendl;
6457
6458 filepath destpath = req->get_filepath();
6459 filepath srcpath = req->get_filepath2();
6460 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6461 respond_to_request(mdr, -EINVAL);
6462 return;
6463 }
6464 const string &destname = destpath.last_dentry();
6465
6466 vector<CDentry*>& srctrace = mdr->dn[1];
6467 vector<CDentry*>& desttrace = mdr->dn[0];
6468
6469 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6470
6471 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6472 if (!destdn) return;
6473 dout(10) << " destdn " << *destdn << dendl;
6474 if (mdr->snapid != CEPH_NOSNAP) {
6475 respond_to_request(mdr, -EROFS);
6476 return;
6477 }
6478 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6479 CDir *destdir = destdn->get_dir();
6480 assert(destdir->is_auth());
6481
6482 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6483 if (r > 0)
6484 return; // delayed
6485 if (r < 0) {
6486 if (r == -ESTALE) {
6487 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6488 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6489 } else {
6490 dout(10) << "FAIL on error " << r << dendl;
6491 respond_to_request(mdr, r);
6492 }
6493 return;
6494
6495 }
6496 assert(!srctrace.empty());
6497 CDentry *srcdn = srctrace[srctrace.size()-1];
6498 dout(10) << " srcdn " << *srcdn << dendl;
6499 if (srcdn->last != CEPH_NOSNAP) {
6500 respond_to_request(mdr, -EROFS);
6501 return;
6502 }
6503 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6504 CInode *srci = srcdnl->get_inode();
6505 dout(10) << " srci " << *srci << dendl;
6506
6507 CInode *oldin = 0;
6508 if (!destdnl->is_null()) {
6509 //dout(10) << "dest dn exists " << *destdn << dendl;
6510 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6511 if (!oldin) return;
6512 dout(10) << " oldin " << *oldin << dendl;
6513
6514 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6515 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6516 respond_to_request(mdr, -ENOTEMPTY);
6517 return;
6518 }
6519
6520 // if srcdn is replica, need to make sure its linkage is correct
6521 if (srcdn->is_auth() ||
6522 srcdn->lock.can_read(mdr->get_client()) ||
6523 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
6524 // mv /some/thing /to/some/existing_other_thing
6525 if (oldin->is_dir() && !srci->is_dir()) {
6526 respond_to_request(mdr, -EISDIR);
6527 return;
6528 }
6529 if (!oldin->is_dir() && srci->is_dir()) {
6530 respond_to_request(mdr, -ENOTDIR);
6531 return;
6532 }
6533 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6534 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6535 return;
6536 }
6537 }
6538 }
6539
6540 // -- some sanity checks --
6541
6542 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6543 if (destpath.get_ino() != srcpath.get_ino() &&
6544 !(req->get_source().is_mds() &&
6545 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6546 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6547 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6548 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6549 while (srcbase != destbase &&
6550 !srcbase->is_projected_ancestor_of(destbase)) {
6551 CDentry *pdn = srcbase->get_projected_parent_dn();
6552 srctrace.insert(srctrace.begin(), pdn);
6553 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6554 srcbase = pdn->get_dir()->get_inode();
6555 }
6556
6557 // then, extend destpath until it shares the same parent inode as srcpath.
6558 while (destbase != srcbase) {
6559 CDentry *pdn = destbase->get_projected_parent_dn();
6560 desttrace.insert(desttrace.begin(), pdn);
6561 rdlocks.insert(&pdn->lock);
6562 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6563 destbase = pdn->get_dir()->get_inode();
6564 }
6565 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6566 }
6567
6568 // src == dest?
6569 if (srcdn->get_dir() == destdir && srcdn->name == destname) {
6570 dout(7) << "rename src=dest, noop" << dendl;
6571 respond_to_request(mdr, 0);
6572 return;
6573 }
6574
6575 // dest a child of src?
6576 // e.g. mv /usr /usr/foo
6577 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6578 while (pdn) {
6579 if (pdn == srcdn) {
6580 dout(7) << "cannot rename item to be a child of itself" << dendl;
6581 respond_to_request(mdr, -EINVAL);
6582 return;
6583 }
6584 pdn = pdn->get_dir()->inode->parent;
6585 }
6586
6587 // is this a stray migration, reintegration or merge? (sanity checks!)
6588 if (mdr->reqid.name.is_mds() &&
6589 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6590 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6591 !(destdnl->is_remote() &&
6592 destdnl->get_remote_ino() == srci->ino())) {
6593 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatever.
6594 return;
6595 }
6596
6597 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6598 (srcdnl->is_primary() || destdnl->is_primary()));
6599 if (linkmerge)
6600 dout(10) << " this is a link merge" << dendl;
6601
6602 // -- create stray dentry? --
6603 CDentry *straydn = NULL;
6604 if (destdnl->is_primary() && !linkmerge) {
6605 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6606 if (!straydn)
6607 return;
6608 dout(10) << " straydn is " << *straydn << dendl;
6609 } else if (mdr->straydn) {
6610 mdr->unpin(mdr->straydn);
6611 mdr->straydn = NULL;
6612 }
6613
6614 // -- prepare witness list --
6615 /*
6616 * NOTE: we use _all_ replicas as witnesses.
6617 * this probably isn't totally necessary (esp for file renames),
6618 * but if/when we change that, we have to make sure rejoin is
6619 * sufficiently robust to handle strong rejoins from survivors
6620 * with totally wrong dentry->inode linkage.
6621 * (currently, it can ignore rename effects, because the resolve
6622 * stage will sort them out.)
6623 */
6624 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6625 if (srcdn->is_auth())
6626 srcdn->list_replicas(witnesses);
6627 else
6628 witnesses.insert(srcdn->authority().first);
6629 if (srcdnl->is_remote() && !srci->is_auth())
6630 witnesses.insert(srci->authority().first);
6631 destdn->list_replicas(witnesses);
6632 if (destdnl->is_remote() && !oldin->is_auth())
6633 witnesses.insert(oldin->authority().first);
6634 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6635
6636
6637 // -- locks --
6638 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6639
6640 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6641 for (int i=0; i<(int)srctrace.size(); i++)
6642 rdlocks.insert(&srctrace[i]->lock);
6643 xlocks.insert(&srcdn->lock);
6644 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6645 if (srcdirauth != mds->get_nodeid()) {
6646 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6647 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6648 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6649 if (srci->is_dir())
6650 rdlocks.insert(&srci->dirfragtreelock);
6651 } else {
6652 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6653 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6654 }
6655 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6656
6657 // straydn?
6658 if (straydn) {
6659 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6660 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6661 xlocks.insert(&straydn->lock);
6662 }
6663
6664 // xlock versionlock on dentries if there are witnesses.
6665 // replicas can't see projected dentry linkages, and will get
6666 // confused if we try to pipeline things.
6667 if (!witnesses.empty()) {
6668 // take xlock on all projected ancestor dentries for srcdn and destdn.
6669 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6670 for (int i= 0; i<(int)srctrace.size(); i++) {
6671 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6672 xlocks.insert(&srctrace[i]->versionlock);
6673 }
6674 for (int i=0; i<(int)desttrace.size(); i++) {
6675 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6676 xlocks.insert(&desttrace[i]->versionlock);
6677 }
6678 // xlock srci and oldin's primary dentries, so witnesses can call
6679 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6680 // is traversed.
6681 if (srcdnl->is_remote())
6682 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6683 if (destdnl->is_remote())
6684 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6685 }
6686
6687 // we need to update srci's ctime. xlock its least contended lock to do that...
6688 xlocks.insert(&srci->linklock);
6689
6690 // xlock oldin (for nlink--)
6691 if (oldin) {
6692 xlocks.insert(&oldin->linklock);
6693 if (oldin->is_dir())
6694 rdlocks.insert(&oldin->filelock);
6695 }
6696 if (srcdnl->is_primary() && srci->is_dir())
6697 // FIXME: this should happen whenever we are renaming between
6698 // realms, regardless of the file type
6699 // FIXME: If/when this changes, make sure to update the
6700 // "allowance" in handle_slave_rename_prep
6701 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6702 else
6703 rdlocks.insert(&srci->snaplock);
6704
6705 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6706 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6707 &remote_wrlocks, auth_pin_freeze))
6708 return;
6709
6710 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6711 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6712 return;
6713
6714 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6715 return;
6716
6717 if (!check_fragment_space(mdr, destdn->get_dir()))
6718 return;
6719
6720 if (!check_access(mdr, srci, MAY_WRITE))
6721 return;
6722 }
6723
6724 // with read lock, really verify oldin is empty
6725 if (oldin &&
6726 oldin->is_dir() &&
6727 _dir_is_nonempty(mdr, oldin)) {
6728 respond_to_request(mdr, -ENOTEMPTY);
6729 return;
6730 }
6731
6732 /* project_past_snaprealm_parent() will do this job
6733 *
6734 // moving between snaprealms?
6735 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6736 SnapRealm *srcrealm = srci->find_snaprealm();
6737 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6738 if (srcrealm != destrealm &&
6739 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6740 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6741 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6742 mdcache->snaprealm_create(mdr, srci);
6743 return;
6744 }
6745 }
6746 */
6747
6748 assert(g_conf->mds_kill_rename_at != 1);
6749
6750 // -- open all srcdn inode frags, if any --
6751 // we need these open so that auth can properly delegate from inode to dirfrags
6752 // after the inode is _ours_.
6753 if (srcdnl->is_primary() &&
6754 !srcdn->is_auth() &&
6755 srci->is_dir()) {
6756 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6757 mdr->set_stickydirs(srci);
6758
6759 list<frag_t> frags;
6760 srci->dirfragtree.get_leaves(frags);
6761 for (list<frag_t>::iterator p = frags.begin();
6762 p != frags.end();
6763 ++p) {
6764 CDir *dir = srci->get_dirfrag(*p);
6765 if (!dir) {
6766 dout(10) << " opening " << *p << " under " << *srci << dendl;
6767 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6768 return;
6769 }
6770 }
6771 }
6772
6773 // -- prepare witnesses --
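// In outline: every witness MDS is asked (via OP_RENAMEPREP) to
// journal a slave prepare, including enough state to roll the rename
// back. The srcdn auth is contacted last, since its prepare also
// exports srci to us. Only once all prepares are acked do we journal
// and apply the master update below.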
6774
6775 // do srcdn auth last
6776 mds_rank_t last = MDS_RANK_NONE;
6777 if (!srcdn->is_auth()) {
6778 last = srcdn->authority().first;
6779 mdr->more()->srcdn_auth_mds = last;
6780 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6781 // are involved in the rename operation.
6782 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6783 dout(10) << " preparing ambiguous auth for srci" << dendl;
6784 assert(mdr->more()->is_remote_frozen_authpin);
6785 assert(mdr->more()->rename_inode == srci);
6786 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6787 return;
6788 }
6789 }
6790
6791 for (set<mds_rank_t>::iterator p = witnesses.begin();
6792 p != witnesses.end();
6793 ++p) {
6794 if (*p == last) continue; // do it last!
6795 if (mdr->more()->witnessed.count(*p)) {
6796 dout(10) << " already witnessed by mds." << *p << dendl;
6797 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6798 dout(10) << " already waiting on witness mds." << *p << dendl;
6799 } else {
6800 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6801 return;
6802 }
6803 }
6804 if (!mdr->more()->waiting_on_slave.empty())
6805 return; // we're waiting for a witness.
6806
6807 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6808 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6809 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6810 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6811 return;
6812 }
6813
6814 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6815 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6816 assert(g_conf->mds_kill_rename_at != 3);
6817 if (!mdr->more()->slaves.empty() && srci->is_dir())
6818 assert(g_conf->mds_kill_rename_at != 4);
6819
6820 // -- declare now --
6821 mdr->set_mds_stamp(ceph_clock_now());
6822
6823 // -- prepare journal entry --
6824 mdr->ls = mdlog->get_current_segment();
6825 EUpdate *le = new EUpdate(mdlog, "rename");
6826 mdlog->start_entry(le);
6827 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6828 if (!mdr->more()->witnessed.empty()) {
6829 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6830
6831 le->reqid = mdr->reqid;
6832 le->had_slaves = true;
6833
6834 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6835 // no need to send frozen auth pin to recovering auth MDS of srci
6836 mdr->more()->is_remote_frozen_authpin = false;
6837 }
6838
6839 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6840 if (le->client_map.length())
6841 le->cmapv = mds->sessionmap.get_projected();
6842
6843 // -- commit locally --
6844 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6845
6846 journal_and_reply(mdr, srci, destdn, le, fin);
6847 }
6848
6849
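// Called once the rename EUpdate is journaled: apply the projected
// updates, propagate the new link to dentry replicas, and reply to
// the client. Slave commit/abort is driven separately through the
// uncommitted-master tracking set up in add_uncommitted_master().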
6850 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
6851 {
6852 dout(10) << "_rename_finish " << *mdr << dendl;
6853
6854 if (!mdr->more()->witnessed.empty())
6855 mdcache->logged_master_update(mdr->reqid);
6856
6857 // apply
6858 _rename_apply(mdr, srcdn, destdn, straydn);
6859
6860 mdcache->send_dentry_link(destdn, mdr);
6861
6862 CDentry::linkage_t *destdnl = destdn->get_linkage();
6863 CInode *in = destdnl->get_inode();
6864 bool need_eval = mdr->more()->cap_imports.count(in);
6865
6866 // test hack: test slave commit
6867 if (!mdr->more()->slaves.empty() && !in->is_dir())
6868 assert(g_conf->mds_kill_rename_at != 5);
6869 if (!mdr->more()->slaves.empty() && in->is_dir())
6870 assert(g_conf->mds_kill_rename_at != 6);
6871
6872 // bump popularity
6873 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
6874 if (destdnl->is_remote() && in->is_auth())
6875 mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
6876
6877 // did we import srci? if so, explicitly ack that import before we unlock and reply.
6878
6879 assert(g_conf->mds_kill_rename_at != 7);
6880
6881 // reply
6882 respond_to_request(mdr, 0);
6883
6884 if (need_eval)
6885 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
6886
6887 // clean up?
6888 // respond_to_request() drops locks. So stray reintegration can race with us.
6889 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6890 mdcache->notify_stray(straydn);
6891 }
6892 }
6893
6894
6895
6896 // helpers
6897
6898 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
6899 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
6900 {
6901 if (mds->is_cluster_degraded() &&
6902 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6903 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
6904 if (mdr->more()->waiting_on_slave.empty())
6905 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6906 return false;
6907 }
6908
6909 dout(10) << "_rename_prepare_witness mds." << who << dendl;
6910 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6911 MMDSSlaveRequest::OP_RENAMEPREP);
6912
6913 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
6914 for (auto dn : srctrace)
6915 req->srcdnpath.push_dentry(dn->name);
6916 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
6917 for (auto dn : dsttrace)
6918 req->destdnpath.push_dentry(dn->name);
6919 if (straydn)
6920 mdcache->replicate_stray(straydn, who, req->stray);
6921
6922 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
6923
6924 // srcdn auth will verify our current witness list is sufficient
6925 req->witnesses = witnesses;
6926
6927 req->op_stamp = mdr->get_op_stamp();
6928 mds->send_message_mds(req, who);
6929
6930 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6931 mdr->more()->waiting_on_slave.insert(who);
6932 return true;
6933 }
6934
6935 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
6936 {
6937 version_t oldpv = mdr->more()->inode_import_v;
6938
6939 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
6940
6941 /* import node */
6942 bufferlist::iterator blp = mdr->more()->inode_import.begin();
6943
6944 // imported caps
6945 ::decode(mdr->more()->imported_client_map, blp);
6946 ::encode(mdr->more()->imported_client_map, *client_map_bl,
6947 mds->mdsmap->get_up_features());
6948 prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
6949
6950 list<ScatterLock*> updated_scatterlocks;
6951 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
6952 mdr->more()->cap_imports, updated_scatterlocks);
6953
6954 // hack: force back to !auth and clean, temporarily
6955 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
6956 srcdnl->get_inode()->mark_clean();
6957
6958 return oldpv;
6959 }
6960
6961 bool Server::_need_force_journal(CInode *diri, bool empty)
6962 {
6963 list<CDir*> ls;
6964 diri->get_dirfrags(ls);
6965
6966 bool force_journal = false;
6967 if (empty) {
6968 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6969 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
6970 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
6971 force_journal = true;
6972 break;
6973 } else
6974 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
6975 }
6976 } else {
6977 // see if any children of our frags are auth subtrees.
6978 list<CDir*> subtrees;
6979 mdcache->list_subtrees(subtrees);
6980 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
6981 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6982 CDir *dir = *p;
6983 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
6984 if (dir->contains(*q)) {
6985 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
6986 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
6987 << **q << dendl;
6988 force_journal = true;
6989 break;
6990 } else
6991 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
6992 } else
6993 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
6994 }
6995 if (force_journal)
6996 break;
6997 }
6998 }
6999 return force_journal;
7000 }
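// Illustrative example: if we are not auth for a directory being
// renamed but we are auth for a subtree rooted beneath one of its
// dirfrags, journal replay on this rank needs the rename event to
// re-establish its subtree bounds, so the caller force-journals the
// dentry even though it is not auth for it.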
7001
7002 void Server::_rename_prepare(MDRequestRef& mdr,
7003 EMetaBlob *metablob, bufferlist *client_map_bl,
7004 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7005 {
7006 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7007 if (straydn)
7008 dout(10) << " straydn " << *straydn << dendl;
7009
7010 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7011 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7012 CInode *srci = srcdnl->get_inode();
7013 CInode *oldin = destdnl->get_inode();
7014
7015 // primary+remote link merge?
7016 bool linkmerge = (srci == destdnl->get_inode() &&
7017 (srcdnl->is_primary() || destdnl->is_primary()));
7018 bool silent = srcdn->get_dir()->inode->is_stray();
7019
7020 bool force_journal_dest = false;
7021 if (srci->is_dir() && !destdn->is_auth()) {
7022 if (srci->is_auth()) {
7023 // if we are auth for srci and exporting it, force journal because journal replay needs
7024 // the source inode to create auth subtrees.
7025 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7026 force_journal_dest = true;
7027 } else
7028 force_journal_dest = _need_force_journal(srci, false);
7029 }
7030
7031 bool force_journal_stray = false;
7032 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7033 force_journal_stray = _need_force_journal(oldin, true);
7034
7035 if (linkmerge)
7036 dout(10) << " merging remote and primary links to the same inode" << dendl;
7037 if (silent)
7038 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7039 if (force_journal_dest)
7040 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7041 if (force_journal_stray)
7042 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7043
7044 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7045 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7046 metablob->renamed_dirino = srci->ino();
7047 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7048 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7049 metablob->renamed_dirino = oldin->ino();
7050 }
7051
7052 // prepare
7053 inode_t *pi = 0; // renamed inode
7054 inode_t *tpi = 0; // target/overwritten inode
7055
7056 // target inode
7057 if (!linkmerge) {
7058 if (destdnl->is_primary()) {
7059 assert(straydn); // moving to straydn.
7060 // link--, and move.
7061 if (destdn->is_auth()) {
7062 tpi = oldin->project_inode(); //project_snaprealm
7063 tpi->version = straydn->pre_dirty(tpi->version);
7064 tpi->update_backtrace();
7065 }
7066 straydn->push_projected_linkage(oldin);
7067 } else if (destdnl->is_remote()) {
7068 // nlink-- targeti
7069 if (oldin->is_auth()) {
7070 tpi = oldin->project_inode();
7071 tpi->version = oldin->pre_dirty();
7072 }
7073 }
7074 }
7075
7076 // dest
7077 if (srcdnl->is_remote()) {
7078 if (!linkmerge) {
7079 // destdn
7080 if (destdn->is_auth())
7081 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
7082 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7083 // srci
7084 if (srci->is_auth()) {
7085 pi = srci->project_inode();
7086 pi->version = srci->pre_dirty();
7087 }
7088 } else {
7089 dout(10) << " will merge remote onto primary link" << dendl;
7090 if (destdn->is_auth()) {
7091 pi = oldin->project_inode();
7092 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
7093 }
7094 }
7095 } else { // primary
7096 if (destdn->is_auth()) {
7097 version_t oldpv;
7098 if (srcdn->is_auth())
7099 oldpv = srci->get_projected_version();
7100 else {
7101 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
7102
7103 // note which dirfrags have child subtrees in the journal
7104 // event, so that we can open those (as bounds) during replay.
7105 if (srci->is_dir()) {
7106 list<CDir*> ls;
7107 srci->get_dirfrags(ls);
7108 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7109 CDir *dir = *p;
7110 if (!dir->is_auth())
7111 metablob->renamed_dir_frags.push_back(dir->get_frag());
7112 }
7113 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
7114 }
7115 }
7116 pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7117 // & srcdnl->snaprealm
7118 pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
7119 pi->update_backtrace();
7120 }
7121 destdn->push_projected_linkage(srci);
7122 }
7123
7124 // src
7125 if (srcdn->is_auth())
7126 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
7127 srcdn->push_projected_linkage(); // push null linkage
7128
7129 if (!silent) {
7130 if (pi) {
7131 pi->ctime = mdr->get_op_stamp();
7132 pi->change_attr++;
7133 if (linkmerge)
7134 pi->nlink--;
7135 }
7136 if (tpi) {
7137 tpi->ctime = mdr->get_op_stamp();
7138 tpi->change_attr++;
7139 destdn->make_path_string(tpi->stray_prior_path, true);
7140 tpi->nlink--;
7141 if (tpi->nlink == 0)
7142 oldin->state_set(CInode::STATE_ORPHAN);
7143 }
7144 }
7145
7146 // prepare nesting, mtime updates
7147 int predirty_dir = silent ? 0:PREDIRTY_DIR;
7148
7149 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7150 // then link the source inode to destdn
7151 if (destdnl->is_primary()) {
7152 assert(straydn);
7153 if (straydn->is_auth()) {
7154 metablob->add_dir_context(straydn->get_dir());
7155 metablob->add_dir(straydn->get_dir(), true);
7156 }
7157 }
7158
7159 // sub off target
7160 if (destdn->is_auth() && !destdnl->is_null()) {
7161 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
7162 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
7163 if (destdnl->is_primary()) {
7164 assert(straydn);
7165 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
7166 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7167 }
7168 }
7169
7170 // move srcdn
7171 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
7172 int flags = predirty_dir | predirty_primary;
7173 if (srcdn->is_auth())
7174 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
7175 if (destdn->is_auth())
7176 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
7177
7178 SnapRealm *src_realm = srci->find_snaprealm();
7179 SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
7180 snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
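// When the rename crosses snaprealms, dentries (re)linked under the
// dest realm have 'first' bumped to next_dest_snap, so snapids taken
// before the rename remain resolvable via the old linkage; see the
// MAX(..., next_dest_snap) and project_past_snaprealm_parent() calls
// below.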
7181
7182 // add it all to the metablob
7183 // target inode
7184 if (!linkmerge) {
7185 if (destdnl->is_primary()) {
7186 assert(straydn);
7187 if (destdn->is_auth()) {
7188 // project snaprealm, too
7189 if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
7190 oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
7191 straydn->first = MAX(oldin->first, next_dest_snap);
7192 metablob->add_primary_dentry(straydn, oldin, true, true);
7193 } else if (force_journal_stray) {
7194 dout(10) << " forced journaling straydn " << *straydn << dendl;
7195 metablob->add_dir_context(straydn->get_dir());
7196 metablob->add_primary_dentry(straydn, oldin, true);
7197 }
7198 } else if (destdnl->is_remote()) {
7199 if (oldin->is_auth()) {
7200 // auth for targeti
7201 metablob->add_dir_context(oldin->get_projected_parent_dir());
7202 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
7203 CEPH_NOSNAP, 0, destdnl);
7204 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
7205 }
7206 }
7207 }
7208
7209 // dest
7210 if (srcdnl->is_remote()) {
7211 if (!linkmerge) {
7212 if (destdn->is_auth() && !destdnl->is_null())
7213 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7214 else
7215 destdn->first = MAX(destdn->first, next_dest_snap);
7216
7217 if (destdn->is_auth())
7218 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7219 if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
7220 metablob->add_dir_context(srci->get_projected_parent_dir());
7221 mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
7222 metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
7223 }
7224 } else {
7225 if (destdn->is_auth() && !destdnl->is_null())
7226 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7227 else
7228 destdn->first = MAX(destdn->first, next_dest_snap);
7229
7230 if (destdn->is_auth())
7231 metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
7232 }
7233 } else if (srcdnl->is_primary()) {
7234 // project snap parent update?
7235 if (destdn->is_auth() && src_realm != dest_realm &&
7236 (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
7237 srci->project_past_snaprealm_parent(dest_realm);
7238
7239 if (destdn->is_auth() && !destdnl->is_null())
7240 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7241 else
7242 destdn->first = MAX(destdn->first, next_dest_snap);
7243
7244 if (destdn->is_auth())
7245 metablob->add_primary_dentry(destdn, srci, true, true);
7246 else if (force_journal_dest) {
7247 dout(10) << " forced journaling destdn " << *destdn << dendl;
7248 metablob->add_dir_context(destdn->get_dir());
7249 metablob->add_primary_dentry(destdn, srci, true);
7250 if (srcdn->is_auth() && srci->is_dir()) {
7251 // journal new subtrees root dirfrags
7252 list<CDir*> ls;
7253 srci->get_dirfrags(ls);
7254 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7255 CDir *dir = *p;
7256 if (dir->is_auth())
7257 metablob->add_dir(dir, true);
7258 }
7259 }
7260 }
7261 }
7262
7263 // src
7264 if (srcdn->is_auth()) {
7265 dout(10) << " journaling srcdn " << *srcdn << dendl;
7266 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
7267 // also journal the inode in case we need to do a slave rename rollback. It is OK to add
7268 // both primary and NULL dentries, because during journal replay the null dentry is
7269 // processed after the primary dentry.
7270 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
7271 metablob->add_primary_dentry(srcdn, srci, true);
7272 metablob->add_null_dentry(srcdn, true);
7273 } else
7274 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
7275
7276 // make renamed inode first track the dn
7277 if (srcdnl->is_primary() && destdn->is_auth())
7278 srci->first = destdn->first;
7279
7280 if (oldin && oldin->is_dir()) {
7281 assert(straydn);
7282 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
7283 }
7284 if (srci->is_dir())
7285 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
7286
7287 }
7288
7289
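// Make the prepared rename live in the cache, roughly in this order:
// unlink the overwritten target (a primary target moves to straydn),
// unlink srcdn, pop the projected linkages at destdn/straydn/srcdn,
// finish any srci inode/cap import, then fix up the subtree map for
// renamed directories.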
7290 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7291 {
7292 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7293 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
7294
7295 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7296 CDentry::linkage_t *destdnl = destdn->get_linkage();
7297
7298 CInode *oldin = destdnl->get_inode();
7299
7300 // primary+remote link merge?
7301 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7302 (srcdnl->is_primary() || destdnl->is_primary()));
7303
7304 // target inode
7305 if (!linkmerge) {
7306 if (destdnl->is_primary()) {
7307 assert(straydn);
7308 dout(10) << "straydn is " << *straydn << dendl;
7309 destdn->get_dir()->unlink_inode(destdn, false);
7310
7311 straydn->pop_projected_linkage();
7312 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7313 assert(!straydn->is_projected()); // no other projected
7314
7315 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7316
7317 // nlink-- targeti
7318 if (destdn->is_auth()) {
7319 bool hadrealm = (oldin->snaprealm ? true : false);
7320 oldin->pop_and_dirty_projected_inode(mdr->ls);
7321 if (oldin->snaprealm && !hadrealm)
7322 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
7323 } else {
7324 // FIXME this snaprealm is not filled out correctly
7325 //oldin->open_snaprealm(); might be sufficient..
7326 }
7327 } else if (destdnl->is_remote()) {
7328 destdn->get_dir()->unlink_inode(destdn, false);
7329 if (oldin->is_auth())
7330 oldin->pop_and_dirty_projected_inode(mdr->ls);
7331 }
7332 }
7333
7334 // unlink src before we relink it at dest
7335 CInode *in = srcdnl->get_inode();
7336 assert(in);
7337
7338 bool srcdn_was_remote = srcdnl->is_remote();
7339 srcdn->get_dir()->unlink_inode(srcdn);
7340
7341 // dest
7342 if (srcdn_was_remote) {
7343 if (!linkmerge) {
7344 // destdn
7345 destdnl = destdn->pop_projected_linkage();
7346 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7347 assert(!destdn->is_projected()); // no other projected
7348
7349 destdn->link_remote(destdnl, in);
7350 if (destdn->is_auth())
7351 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
7352 // in
7353 if (in->is_auth())
7354 in->pop_and_dirty_projected_inode(mdr->ls);
7355 } else {
7356 dout(10) << "merging remote onto primary link" << dendl;
7357 oldin->pop_and_dirty_projected_inode(mdr->ls);
7358 }
7359 } else { // primary
7360 if (linkmerge) {
7361 dout(10) << "merging primary onto remote link" << dendl;
7362 destdn->get_dir()->unlink_inode(destdn, false);
7363 }
7364 destdnl = destdn->pop_projected_linkage();
7365 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7366 assert(!destdn->is_projected()); // no other projected
7367
7368 // srcdn inode import?
7369 if (!srcdn->is_auth() && destdn->is_auth()) {
7370 assert(mdr->more()->inode_import.length() > 0);
7371
7372 map<client_t,Capability::Import> imported_caps;
7373
7374 // finish cap imports
7375 finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
7376 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
7377 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
7378 mdr->more()->srcdn_auth_mds, true,
7379 mdr->more()->cap_imports[destdnl->get_inode()],
7380 imported_caps);
7381 }
7382
7383 mdr->more()->inode_import.clear();
7384 ::encode(imported_caps, mdr->more()->inode_import);
7385
7386 /* hack: add an auth pin for each xlock we hold. These were
7387 * remote xlocks previously but now they're local and
7388 * we're going to try and unpin when we xlock_finish. */
7389 for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
7390 i != mdr->xlocks.end();
7391 ++i)
7392 if ((*i)->get_parent() == destdnl->get_inode() &&
7393 !(*i)->is_locallock())
7394 mds->locker->xlock_import(*i);
7395
7396 // hack: fix auth bit
7397 in->state_set(CInode::STATE_AUTH);
7398
7399 mdr->clear_ambiguous_auth();
7400 }
7401
7402 if (destdn->is_auth()) {
7403 in->pop_and_dirty_projected_inode(mdr->ls);
7404
7405 } else {
7406 // FIXME: fix up snaprealm!
7407 }
7408 }
7409
7410 // src
7411 if (srcdn->is_auth())
7412 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
7413 srcdn->pop_projected_linkage();
7414 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7415 assert(!srcdn->is_projected()); // no other projected
7416
7417 // apply remaining projected inodes (nested)
7418 mdr->apply();
7419
7420 // update subtree map?
7421 if (destdnl->is_primary() && in->is_dir())
7422 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7423
7424 if (straydn && oldin->is_dir())
7425 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
7426
7427 // removing a new dn?
7428 if (srcdn->is_auth())
7429 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
7430 }
7431
7432
7433
7434 // ------------
7435 // SLAVE
7436
7437 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7438 CDentry *srcdn, *destdn, *straydn;
7439 public:
7440 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7441 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7442 void finish(int r) override {
7443 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7444 }
7445 };
7446
7447 class C_MDS_SlaveRenameCommit : public ServerContext {
7448 MDRequestRef mdr;
7449 CDentry *srcdn, *destdn, *straydn;
7450 public:
7451 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7452 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7453 void finish(int r) override {
7454 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7455 }
7456 };
7457
7458 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7459 MDRequestRef mdr;
7460 public:
7461 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7462 ServerContext(s), mdr(r) {}
7463 void finish(int r) override {
7464 server->_slave_rename_sessions_flushed(mdr);
7465 }
7466 };
7467
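// Slave-side flow, in sketch: handle_slave_rename_prep journals an
// ESlaveUpdate prepare carrying a rollback blob, then
// _logged_slave_rename applies the rename (exporting srci if we are
// srcdn auth) and acks the master. _commit_slave_rename later either
// journals the commit or replays the rollback blob, depending on
// whether the master committed.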
7468 /* This function DOES put the mdr->slave_request before returning */
7469 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7470 {
7471 dout(10) << "handle_slave_rename_prep " << *mdr
7472 << " " << mdr->slave_request->srcdnpath
7473 << " to " << mdr->slave_request->destdnpath
7474 << dendl;
7475
7476 if (mdr->slave_request->is_interrupted()) {
7477 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7478 MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7479 reply->mark_interrupted();
7480 mds->send_message_mds(reply, mdr->slave_to_mds);
7481 mdr->slave_request->put();
7482 mdr->slave_request = 0;
7483 return;
7484 }
7485
7486 // discover destdn
7487 filepath destpath(mdr->slave_request->destdnpath);
7488 dout(10) << " dest " << destpath << dendl;
7489 vector<CDentry*> trace;
7490 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7491 if (r > 0) return;
7492 if (r == -ESTALE) {
7493 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7494 mdr->slave_to_mds);
7495 return;
7496 }
7497 assert(r == 0); // we shouldn't get an error here!
7498
7499 CDentry *destdn = trace[trace.size()-1];
7500 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7501 dout(10) << " destdn " << *destdn << dendl;
7502 mdr->pin(destdn);
7503
7504 // discover srcdn
7505 filepath srcpath(mdr->slave_request->srcdnpath);
7506 dout(10) << " src " << srcpath << dendl;
7507 CInode *srci = nullptr;
7508 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7509 if (r > 0) return;
7510 assert(r == 0);
7511
7512 // srcpath must not point to a null dentry
7513 assert(srci != nullptr);
7514
7515 CDentry *srcdn = trace[trace.size()-1];
7516 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7517 dout(10) << " srcdn " << *srcdn << dendl;
7518 mdr->pin(srcdn);
7519 mdr->pin(srci);
7520
7521 // stray?
7522 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7523 (srcdnl->is_primary() || destdnl->is_primary()));
7524 CDentry *straydn = mdr->straydn;
7525 if (destdnl->is_primary() && !linkmerge)
7526 assert(straydn);
7527
7528 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7529 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7530
7531 // set up commit waiter (early, to clean up any freezing etc we do)
7532 if (!mdr->more()->slave_commit)
7533 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7534
7535 // am i srcdn auth?
7536 if (srcdn->is_auth()) {
7537 set<mds_rank_t> srcdnrep;
7538 srcdn->list_replicas(srcdnrep);
7539
7540 bool reply_witness = false;
7541 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7542 // freeze?
7543 // we need this to
7544 // - avoid conflicting lock state changes
7545 // - avoid concurrent updates to the inode
7546 // (this could also be accomplished with the versionlock)
7547 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7548 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7549 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7550 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7551
7552 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7553 if (srcdnl->get_inode()->is_frozen_auth_pin())
7554 mdr->unfreeze_auth_pin();
7555
7556 if (!frozen_inode) {
7557 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7558 return;
7559 }
7560
7561 /*
7562 * set ambiguous auth for srci
7563 * NOTE: we don't worry about ambiguous cache expire as we do
7564 * with subtree migrations because all slaves will pin
7565 * srcdn->get_inode() for duration of this rename.
7566 */
7567 mdr->set_ambiguous_auth(srcdnl->get_inode());
7568
7569 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7570 // the master will send another OP_RENAMEPREP slave request later.
7571 if (mdr->slave_request->witnesses.size() > 1) {
7572 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7573 reply_witness = true;
7574 }
7575
7576 // make sure bystanders have received all lock related messages
7577 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7578 if (*p == mdr->slave_to_mds ||
7579 (mds->is_cluster_degraded() &&
7580 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7581 continue;
7582 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7583 MMDSSlaveRequest::OP_RENAMENOTIFY);
7584 mds->send_message_mds(notify, *p);
7585 mdr->more()->waiting_on_slave.insert(*p);
7586 }
7587
7588 // make sure clients have received all cap related messages
7589 set<client_t> export_client_set;
7590 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7591
7592 MDSGatherBuilder gather(g_ceph_context);
7593 flush_client_sessions(export_client_set, gather);
7594 if (gather.has_subs()) {
7595 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7596 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7597 gather.activate();
7598 }
7599 }
7600
7601 // is witness list sufficient?
7602 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7603 if (*p == mdr->slave_to_mds ||
7604 mdr->slave_request->witnesses.count(*p)) continue;
7605 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7606 reply_witness = true;
7607 break;
7608 }
7609
7610 if (reply_witness) {
7611 assert(!srcdnrep.empty());
7612 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7613 MMDSSlaveRequest::OP_RENAMEPREPACK);
7614 reply->witnesses.swap(srcdnrep);
7615 mds->send_message_mds(reply, mdr->slave_to_mds);
7616 mdr->slave_request->put();
7617 mdr->slave_request = 0;
7618 return;
7619 }
7620 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7621 if (!mdr->more()->waiting_on_slave.empty()) {
7622 dout(10) << " still waiting for rename notify acks from "
7623 << mdr->more()->waiting_on_slave << dendl;
7624 return;
7625 }
7626 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7627 // set ambiguous auth for srci on witnesses
7628 mdr->set_ambiguous_auth(srcdnl->get_inode());
7629 }
7630
7631 // encode everything we'd need to roll this back... basically, just the original state.
7632 rename_rollback rollback;
7633
7634 rollback.reqid = mdr->reqid;
7635
7636 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7637 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7638 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7639 rollback.orig_src.dname = srcdn->name;
7640 if (srcdnl->is_primary())
7641 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7642 else {
7643 assert(srcdnl->is_remote());
7644 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7645 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7646 }
7647
7648 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7649 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7650 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7651 rollback.orig_dest.dname = destdn->name;
7652 if (destdnl->is_primary())
7653 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7654 else if (destdnl->is_remote()) {
7655 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7656 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7657 }
7658
7659 if (straydn) {
7660 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7661 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7662 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7663 rollback.stray.dname = straydn->name;
7664 }
7665 ::encode(rollback, mdr->more()->rollback_bl);
7666 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
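// if the master aborts or fails before committing, this blob is
// decoded by do_rename_rollback() (below) to restore the original
// dentries and dir stats.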
7667
7668 // journal.
7669 mdr->ls = mdlog->get_current_segment();
7670 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7671 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7672 mdlog->start_entry(le);
7673 le->rollback = mdr->more()->rollback_bl;
7674
7675 bufferlist blah; // inode import data... obviously not used if we're the slave
7676 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7677
7678 if (le->commit.empty()) {
7679 dout(10) << " empty metablob, skipping journal" << dendl;
7680 mdlog->cancel_entry(le);
7681 mdr->ls = NULL;
7682 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7683 } else {
7684 mdr->more()->slave_update_journaled = true;
7685 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7686 mdr, __func__);
7687 mdlog->flush();
7688 }
7689 }
7690
7691 void Server::_logged_slave_rename(MDRequestRef& mdr,
7692 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7693 {
7694 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7695
7696 // prepare ack
7697 MMDSSlaveRequest *reply = NULL;
7698 if (!mdr->aborted) {
7699 reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7700 if (!mdr->more()->slave_update_journaled)
7701 reply->mark_not_journaled();
7702 }
7703
7704 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7705 CDentry::linkage_t *destdnl = NULL;
7706 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7707
7708 // export srci?
7709 if (srcdn->is_auth() && srcdnl->is_primary()) {
7710 // set export bounds for CInode::encode_export()
7711 list<CDir*> bounds;
7712 if (srcdnl->get_inode()->is_dir()) {
7713 srcdnl->get_inode()->get_dirfrags(bounds);
7714 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7715 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7716 }
7717
7718 map<client_t,entity_inst_t> exported_client_map;
7719 bufferlist inodebl;
7720 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7721 exported_client_map);
7722
7723 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7724 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7725
7726 if (reply) {
7727 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7728 reply->inode_export.claim_append(inodebl);
7729 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7730 }
7731
7732 // remove mdr auth pin
7733 mdr->auth_unpin(srcdnl->get_inode());
7734 mdr->more()->is_inode_exporter = true;
7735
7736 if (srcdnl->get_inode()->is_dirty())
7737 srcdnl->get_inode()->mark_clean();
7738
7739 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
7740 }
7741
7742 // apply
7743 _rename_apply(mdr, srcdn, destdn, straydn);
7744
7745 destdnl = destdn->get_linkage();
7746
7747 // bump popularity
7748 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
7749 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7750 mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
7751 META_POP_IWR);
7752
7753 // done.
7754 mdr->slave_request->put();
7755 mdr->slave_request = 0;
7756 mdr->straydn = 0;
7757
7758 if (reply) {
7759 mds->send_message_mds(reply, mdr->slave_to_mds);
7760 } else {
7761 assert(mdr->aborted);
7762 dout(10) << " abort flag set, finishing" << dendl;
7763 mdcache->request_finish(mdr);
7764 }
7765 }
7766
7767 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7768 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7769 {
7770 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7771
7772 CDentry::linkage_t *destdnl = destdn->get_linkage();
7773
7774 list<MDSInternalContextBase*> finished;
7775 if (r == 0) {
7776 // unfreeze+singleauth inode
7777 // hmm, do i really need to delay this?
7778 if (mdr->more()->is_inode_exporter) {
7779
7780 CInode *in = destdnl->get_inode();
7781
7782 // drop our pins
7783 // we exported, clear out any xlocks that we moved to another MDS
7784 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7785 while (i != mdr->xlocks.end()) {
7786 SimpleLock *lock = *i++;
7787
7788 // we only care about xlocks on the exported inode
7789 if (lock->get_parent() == in &&
7790 !lock->is_locallock())
7791 mds->locker->xlock_export(lock, mdr.get());
7792 }
7793
7794 map<client_t,Capability::Import> peer_imported;
7795 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7796 ::decode(peer_imported, bp);
7797
7798 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7799 mdcache->migrator->finish_export_inode(destdnl->get_inode(),
7800 mdr->get_mds_stamp(),
7801 mdr->slave_to_mds, peer_imported, finished);
7802 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
7803
7804 // unfreeze
7805 assert(destdnl->get_inode()->is_frozen_inode());
7806 destdnl->get_inode()->unfreeze_inode(finished);
7807 }
7808
7809 // singleauth
7810 if (mdr->more()->is_ambiguous_auth) {
7811 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7812 mdr->more()->is_ambiguous_auth = false;
7813 }
7814
7815 if (straydn && mdr->more()->slave_update_journaled) {
7816 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7817 if (strayin && !strayin->snaprealm)
7818 mdcache->clear_dirty_bits_for_stray(strayin);
7819 }
7820
7821 mds->queue_waiters(finished);
7822 mdr->cleanup();
7823
7824 if (mdr->more()->slave_update_journaled) {
7825 // write a commit to the journal
7826 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7827 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7828 ESlaveUpdate::RENAME);
7829 mdlog->start_entry(le);
7830 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7831 mdlog->flush();
7832 } else {
7833 _committed_slave(mdr);
7834 }
7835 } else {
7836
7837 // abort
7838 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7839 // witness list to the master, and it failed before we tried prep again.
7840 if (mdr->more()->rollback_bl.length()) {
7841 if (mdr->more()->is_inode_exporter) {
7842 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7843 destdnl->get_inode()->abort_export();
7844 }
7845 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7846 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7847 // rollback but preserve the slave request
7848 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7849 mdr->more()->rollback_bl.clear();
7850 } else
7851 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7852 } else {
7853 dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
7854 // singleauth
7855 if (mdr->more()->is_ambiguous_auth) {
7856 if (srcdn->is_auth())
7857 mdr->more()->rename_inode->unfreeze_inode(finished);
7858
7859 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7860 mdr->more()->is_ambiguous_auth = false;
7861 }
7862 mds->queue_waiters(finished);
7863 mdcache->request_finish(mdr);
7864 }
7865 }
7866 }
7867
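// Undo the dirstat/rstat side effects of a rename on one dirfrag:
// re-apply the link/unlink delta to fragstat (and to rstat for a
// primary link), and restore mtime/rctime if the rename's ctime was
// the last thing to touch them.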
7868 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7869 bool isdir, int linkunlink, nest_info_t &rstat)
7870 {
7871 fnode_t *pf;
7872 pf = dir->project_fnode();
7873 mut->add_projected_fnode(dir);
7874 pf->version = dir->pre_dirty();
7875
7876 if (isdir) {
7877 pf->fragstat.nsubdirs += linkunlink;
7878 } else {
7879 pf->fragstat.nfiles += linkunlink;
7880 }
7881 if (r.ino) {
7882 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7883 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7884 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7885 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7886 }
7887 if (pf->fragstat.mtime == ctime) {
7888 pf->fragstat.mtime = r.dirfrag_old_mtime;
7889 if (pf->rstat.rctime == ctime)
7890 pf->rstat.rctime = r.dirfrag_old_rctime;
7891 }
7892 mut->add_updated_lock(&dir->get_inode()->filelock);
7893 mut->add_updated_lock(&dir->get_inode()->nestlock);
7894 }
7895
7896 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
7897 MutationRef mut;
7898 CDentry *srcdn;
7899 version_t srcdnpv;
7900 CDentry *destdn;
7901 CDentry *straydn;
7902 bool finish_mdr;
7903 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
7904 CDentry *sd, version_t pv, CDentry *dd,
7905 CDentry *st, bool f) :
7906 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
7907 straydn(st), finish_mdr(f) {}
7908 void finish(int r) override {
7909 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
7910 destdn, straydn, finish_mdr);
7911 }
7912 };
7913
7914 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
7915 bool finish_mdr)
7916 {
7917 rename_rollback rollback;
7918 bufferlist::iterator p = rbl.begin();
7919 ::decode(rollback, p);
7920
7921 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
7922 // need to finish this update before sending resolve to claim the subtree
7923 mdcache->add_rollback(rollback.reqid, master);
7924
7925 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7926 mut->ls = mds->mdlog->get_current_segment();
7927
7928 CDentry *srcdn = NULL;
7929 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
7930 if (!srcdir)
7931 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
7932 if (srcdir) {
7933 dout(10) << " srcdir " << *srcdir << dendl;
7934 srcdn = srcdir->lookup(rollback.orig_src.dname);
7935 if (srcdn) {
7936 dout(10) << " srcdn " << *srcdn << dendl;
7937 assert(srcdn->get_linkage()->is_null());
7938 } else
7939 dout(10) << " srcdn not found" << dendl;
7940 } else
7941 dout(10) << " srcdir not found" << dendl;
7942
7943 CDentry *destdn = NULL;
7944 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
7945 if (!destdir)
7946 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
7947 if (destdir) {
7948 dout(10) << " destdir " << *destdir << dendl;
7949 destdn = destdir->lookup(rollback.orig_dest.dname);
7950 if (destdn)
7951 dout(10) << " destdn " << *destdn << dendl;
7952 else
7953 dout(10) << " destdn not found" << dendl;
7954 } else
7955 dout(10) << " destdir not found" << dendl;
7956
7957 CInode *in = NULL;
7958 if (rollback.orig_src.ino) {
7959 in = mdcache->get_inode(rollback.orig_src.ino);
7960 if (in && in->is_dir())
7961 assert(srcdn && destdn);
7962 } else
7963 in = mdcache->get_inode(rollback.orig_src.remote_ino);
7964
7965 CDir *straydir = NULL;
7966 CDentry *straydn = NULL;
7967 if (rollback.stray.dirfrag.ino) {
7968 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
7969 if (straydir) {
7970 dout(10) << "straydir " << *straydir << dendl;
7971 straydn = straydir->lookup(rollback.stray.dname);
7972 if (straydn) {
7973 dout(10) << " straydn " << *straydn << dendl;
7974 assert(straydn->get_linkage()->is_primary());
7975 } else
7976 dout(10) << " straydn not found" << dendl;
7977 } else
7978 dout(10) << "straydir not found" << dendl;
7979 }
7980
7981 CInode *target = NULL;
7982 if (rollback.orig_dest.ino) {
7983 target = mdcache->get_inode(rollback.orig_dest.ino);
7984 if (target)
7985 assert(destdn && straydn);
7986 } else if (rollback.orig_dest.remote_ino)
7987 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
7988
7989 // can't use is_auth() in the resolve stage
7990 mds_rank_t whoami = mds->get_nodeid();
7991 // slave
7992 assert(!destdn || destdn->authority().first != whoami);
7993 assert(!straydn || straydn->authority().first != whoami);
7994
7995 bool force_journal_src = false;
7996 bool force_journal_dest = false;
7997 if (in && in->is_dir() && srcdn->authority().first != whoami)
7998 force_journal_src = _need_force_journal(in, false);
7999 if (in && target && target->is_dir())
8000 force_journal_dest = _need_force_journal(in, true);
8001
8002 version_t srcdnpv = 0;
8003 // repair src
8004 if (srcdn) {
8005 if (srcdn->authority().first == whoami)
8006 srcdnpv = srcdn->pre_dirty();
8007 if (rollback.orig_src.ino) {
8008 assert(in);
8009 srcdn->push_projected_linkage(in);
8010 } else
8011 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
8012 rollback.orig_src.remote_d_type);
8013 }
8014
8015 inode_t *pi = 0;
8016 if (in) {
8017 if (in->authority().first == whoami) {
8018 pi = in->project_inode();
8019 mut->add_projected_inode(in);
8020 pi->version = in->pre_dirty();
8021 } else
8022 pi = in->get_projected_inode();
8023 if (pi->ctime == rollback.ctime)
8024 pi->ctime = rollback.orig_src.old_ctime;
8025 }
8026
8027 if (srcdn && srcdn->authority().first == whoami) {
8028 nest_info_t blah;
8029 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
8030 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
8031 }
8032
8033 // repair dest
8034 if (destdn) {
8035 if (rollback.orig_dest.ino && target) {
8036 destdn->push_projected_linkage(target);
8037 } else if (rollback.orig_dest.remote_ino) {
8038 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
8039 rollback.orig_dest.remote_d_type);
8040 } else {
8041 // the dentry will be trimmed soon, it's ok to have wrong linkage
8042 if (rollback.orig_dest.ino)
8043 assert(mds->is_resolve());
8044 destdn->push_projected_linkage();
8045 }
8046 }
8047
8048 if (straydn)
8049 straydn->push_projected_linkage();
8050
8051 if (target) {
8052 inode_t *ti = NULL;
8053 if (target->authority().first == whoami) {
8054 ti = target->project_inode();
8055 mut->add_projected_inode(target);
8056 ti->version = target->pre_dirty();
8057 } else
8058 ti = target->get_projected_inode();
8059 if (ti->ctime == rollback.ctime)
8060 ti->ctime = rollback.orig_dest.old_ctime;
8061 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
8062 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
8063 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
8064 else
8065 assert(rollback.orig_dest.remote_ino &&
8066 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
8067 } else
8068 ti->nlink++;
8069 }
8070
8071 if (srcdn)
8072 dout(0) << " srcdn back to " << *srcdn << dendl;
8073 if (in)
8074 dout(0) << " srci back to " << *in << dendl;
8075 if (destdn)
8076 dout(0) << " destdn back to " << *destdn << dendl;
8077 if (target)
8078 dout(0) << " desti back to " << *target << dendl;
8079
8080 // journal it
8081 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
8082 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
8083 mdlog->start_entry(le);
8084
8085 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
8086 le->commit.add_dir_context(srcdir);
8087 if (rollback.orig_src.ino)
8088 le->commit.add_primary_dentry(srcdn, 0, true);
8089 else
8090 le->commit.add_remote_dentry(srcdn, true);
8091 }
8092
8093 if (!rollback.orig_src.ino && // remote linkage
8094 in && in->authority().first == whoami) {
8095 le->commit.add_dir_context(in->get_projected_parent_dir());
8096 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
8097 }
8098
8099 if (force_journal_dest) {
8100 assert(rollback.orig_dest.ino);
8101 le->commit.add_dir_context(destdir);
8102 le->commit.add_primary_dentry(destdn, 0, true);
8103 }
8104
8105 // slave: no need to journal straydn
8106
8107 if (target && target != in && target->authority().first == whoami) {
8108 assert(rollback.orig_dest.remote_ino);
8109 le->commit.add_dir_context(target->get_projected_parent_dir());
8110 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
8111 }
8112
8113 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
8114 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
8115 le->commit.renamed_dirino = in->ino();
8116 if (srcdn->authority().first == whoami) {
8117 list<CDir*> ls;
8118 in->get_dirfrags(ls);
8119 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8120 CDir *dir = *p;
8121 if (!dir->is_auth())
8122 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8123 }
8124 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8125 }
8126 } else if (force_journal_dest) {
8127 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8128 le->commit.renamed_dirino = target->ino();
8129 }
8130
8131 if (target && target->is_dir()) {
8132 assert(destdn);
8133 mdcache->project_subtree_rename(target, straydir, destdir);
8134 }
8135
8136 if (in && in->is_dir()) {
8137 assert(srcdn);
8138 mdcache->project_subtree_rename(in, destdir, srcdir);
8139 }
8140
8141 if (mdr && !mdr->more()->slave_update_journaled) {
8142 assert(le->commit.empty());
8143 mdlog->cancel_entry(le);
8144 mut->ls = NULL;
8145 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8146 } else {
8147 assert(!le->commit.empty());
8148 if (mdr)
8149 mdr->more()->slave_update_journaled = false;
8150 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8151 destdn, straydn, finish_mdr);
8152 submit_mdlog_entry(le, fin, mdr, __func__);
8153 mdlog->flush();
8154 }
8155 }
8156
8157 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
8158 version_t srcdnpv, CDentry *destdn,
8159 CDentry *straydn, bool finish_mdr)
8160 {
8161 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
8162
8163 if (straydn) {
8164 straydn->get_dir()->unlink_inode(straydn);
8165 straydn->pop_projected_linkage();
8166 }
8167 if (destdn) {
8168 destdn->get_dir()->unlink_inode(destdn);
8169 destdn->pop_projected_linkage();
8170 }
8171 if (srcdn) {
8172 srcdn->pop_projected_linkage();
8173 if (srcdn->authority().first == mds->get_nodeid())
8174 srcdn->mark_dirty(srcdnpv, mut->ls);
8175 }
8176
8177 mut->apply();
8178
8179 if (srcdn && srcdn->get_linkage()->is_primary()) {
8180 CInode *in = srcdn->get_linkage()->get_inode();
8181 if (srcdn->authority().first == mds->get_nodeid())
8182 in->state_set(CInode::STATE_AUTH);
8183 // update subtree map?
8184 if (in && in->is_dir()) {
8185 assert(destdn);
8186 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
8187 }
8188 }
8189
8190 if (destdn) {
8191 CInode *oldin = destdn->get_linkage()->get_inode();
8192 // update subtree map?
8193 if (oldin && oldin->is_dir()) {
8194 assert(straydn);
8195 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
8196 }
8197 }
8198
8199 if (mds->is_resolve()) {
8200 CDir *root = NULL;
8201 if (straydn)
8202 root = mdcache->get_subtree_root(straydn->get_dir());
8203 else if (destdn)
8204 root = mdcache->get_subtree_root(destdn->get_dir());
8205 if (root)
8206 mdcache->try_trim_non_auth_subtree(root);
8207 }
8208
8209 if (mdr) {
8210 list<MDSInternalContextBase*> finished;
8211 if (mdr->more()->is_ambiguous_auth) {
8212 if (srcdn->is_auth())
8213 mdr->more()->rename_inode->unfreeze_inode(finished);
8214
8215 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8216 mdr->more()->is_ambiguous_auth = false;
8217 }
8218 mds->queue_waiters(finished);
8219 if (finish_mdr || mdr->aborted)
8220 mdcache->request_finish(mdr);
8221 else
8222 mdr->more()->slave_rolling_back = false;
8223 }
8224
8225 mdcache->finish_rollback(mut->reqid);
8226
8227 mut->cleanup();
8228 }
8229
8230 /* This function DOES put the passed message before returning */
8231 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8232 {
8233 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8234 << " witnessed by " << ack->get_source()
8235 << " " << *ack << dendl;
8236 mds_rank_t from = mds_rank_t(ack->get_source().num());
8237
8238 // note slave
8239 mdr->more()->slaves.insert(from);
8240 if (mdr->more()->srcdn_auth_mds == from &&
8241 mdr->more()->is_remote_frozen_authpin &&
8242 !mdr->more()->is_ambiguous_auth) {
8243 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
8244 }
8245
8246 // witnessed? or add extra witnesses?
8247 assert(mdr->more()->witnessed.count(from) == 0);
8248 if (ack->is_interrupted()) {
8249 dout(10) << " slave request interrupted, noop" << dendl;
8250 } else if (ack->witnesses.empty()) {
8251 mdr->more()->witnessed.insert(from);
8252 if (!ack->is_not_journaled())
8253 mdr->more()->has_journaled_slaves = true;
8254 } else {
8255 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
8256 mdr->more()->extra_witnesses.swap(ack->witnesses);
8257 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
8258 }
8259
8260 // srci import?
8261 if (ack->inode_export.length()) {
8262 dout(10) << " got srci import" << dendl;
8263 mdr->more()->inode_import.claim(ack->inode_export);
8264 mdr->more()->inode_import_v = ack->inode_export_v;
8265 }
8266
8267 // remove from waiting list
8268 assert(mdr->more()->waiting_on_slave.count(from));
8269 mdr->more()->waiting_on_slave.erase(from);
8270
8271 if (mdr->more()->waiting_on_slave.empty())
8272 dispatch_client_request(mdr); // go again!
8273 else
8274 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
8275 }
8276
8277 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8278 {
8279 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8280 << ack->get_source() << dendl;
8281 assert(mdr->is_slave());
8282 mds_rank_t from = mds_rank_t(ack->get_source().num());
8283
8284 if (mdr->more()->waiting_on_slave.count(from)) {
8285 mdr->more()->waiting_on_slave.erase(from);
8286
8287 if (mdr->more()->waiting_on_slave.empty()) {
8288 if (mdr->slave_request)
8289 dispatch_slave_request(mdr);
8290 } else
8291 dout(10) << " still waiting for rename notify acks from "
8292 << mdr->more()->waiting_on_slave << dendl;
8293 }
8294 }
8295
8296 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8297 {
8298 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8299
8300 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8301 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8302
8303 if (mdr->more()->waiting_on_slave.empty()) {
8304 if (mdr->slave_request)
8305 dispatch_slave_request(mdr);
8306 } else
8307 dout(10) << " still waiting for rename notify acks from "
8308 << mdr->more()->waiting_on_slave << dendl;
8309 }
8310 }
8311
8312 // snaps
8313 /* This function takes responsibility for the passed mdr */
8314 void Server::handle_client_lssnap(MDRequestRef& mdr)
8315 {
8316 MClientRequest *req = mdr->client_request;
8317
8318 // traverse to path
8319 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8320 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8321 respond_to_request(mdr, -ESTALE);
8322 return;
8323 }
8324 if (!diri->is_auth()) {
8325 mdcache->request_forward(mdr, diri->authority().first);
8326 return;
8327 }
8328 if (!diri->is_dir()) {
8329 respond_to_request(mdr, -ENOTDIR);
8330 return;
8331 }
8332 dout(10) << "lssnap on " << *diri << dendl;
8333
8334 // lock snap
8335 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8336 mds->locker->include_snap_rdlocks(rdlocks, diri);
8337 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8338 return;
8339
8340 if (!check_access(mdr, diri, MAY_READ))
8341 return;
8342
8343 SnapRealm *realm = diri->find_snaprealm();
8344 map<snapid_t,SnapInfo*> infomap;
8345 realm->get_snap_info(infomap, diri->get_oldest_snap());
8346
8347 unsigned max_entries = req->head.args.readdir.max_entries;
8348 if (!max_entries)
8349 max_entries = infomap.size();
8350 int max_bytes = req->head.args.readdir.max_bytes;
8351 if (!max_bytes)
8352 // make sure at least one item can be encoded
8353 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
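// (512 KiB plus the configured xattr budget, so even an inode carrying
// maximal xattrs still fits in a single reply)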
8354
8355 __u64 last_snapid = 0;
8356 string offset_str = req->get_path2();
8357 if (!offset_str.empty())
8358 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
8359
8360 bufferlist dirbl;
8361 encode_empty_dirstat(dirbl);
8362
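// reserve the fixed framing (dirstat header, trailing count and flags)
// out of the byte budget before encoding any entries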
8363 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
8364
8365 __u32 num = 0;
8366 bufferlist dnbl;
8367 map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
8368 for (; p != infomap.end() && num < max_entries; ++p) {
8369 dout(10) << p->first << " -> " << *p->second << dendl;
8370
8371 // actual
8372 string snap_name;
8373 if (p->second->ino == diri->ino())
8374 snap_name = p->second->name;
8375 else
8376 snap_name = p->second->get_long_name();
8377
8378 unsigned start_len = dnbl.length();
8379 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
8380 break;
8381
8382 ::encode(snap_name, dnbl);
8383 encode_infinite_lease(dnbl);
8384
8385 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
8386 if (r < 0) {
8387 bufferlist keep;
8388 keep.substr_of(dnbl, 0, start_len);
8389 dnbl.swap(keep);
8390 break;
8391 }
8392 ++num;
8393 }
8394
8395 ::encode(num, dirbl);
8396 __u16 flags = 0;
8397 if (p == infomap.end()) {
8398 flags = CEPH_READDIR_FRAG_END;
8399 if (last_snapid == 0)
8400 flags |= CEPH_READDIR_FRAG_COMPLETE;
8401 }
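// Continuation mirrors readdir: when FRAG_END is absent the client
// reissues the request with path2 set to the last name it received, and
// the upper_bound() above resumes the scan just past that snapid.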
8402 ::encode(flags, dirbl);
8403 dirbl.claim_append(dnbl);
8404
8405 mdr->reply_extra_bl = dirbl;
8406 mdr->tracei = diri;
8407 respond_to_request(mdr, 0);
8408 }
8409
8410
8411 // MKSNAP
8412
8413 struct C_MDS_mksnap_finish : public ServerLogContext {
8414 CInode *diri;
8415 SnapInfo info;
8416 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8417 ServerLogContext(s, r), diri(di), info(i) {}
8418 void finish(int r) override {
8419 server->_mksnap_finish(mdr, diri, info);
8420 }
8421 };
8422
8423 /* This function takes responsibility for the passed mdr. */
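// mksnap is a two-phase update against the snap table: the first pass
// stops at prepare_create() and retries once a (stid, snapid) pair is
// reserved; the second pass journals the projected inode and snaprealm,
// and _mksnap_finish() commits the table transaction and notifies
// clients.  Snapshots must be enabled first, e.g. with
// 'ceph fs set <fs_name> allow_new_snaps true'.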
8424 void Server::handle_client_mksnap(MDRequestRef& mdr)
8425 {
8426 if (!mds->mdsmap->allows_snaps()) {
8427 // snapshots are disallowed until explicitly enabled on the MDSMap
8428 respond_to_request(mdr, -EPERM);
8429 return;
8430 }
8431
8432 MClientRequest *req = mdr->client_request;
8433 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8434 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8435 respond_to_request(mdr, -ESTALE);
8436 return;
8437 }
8438
8439 if (!diri->is_auth()) { // fw to auth?
8440 mdcache->request_forward(mdr, diri->authority().first);
8441 return;
8442 }
8443
8444 // dir only
8445 if (!diri->is_dir()) {
8446 respond_to_request(mdr, -ENOTDIR);
8447 return;
8448 }
8449 if (diri->is_system() && !diri->is_root()) {
8450 // no snaps in system dirs (root is ok)
8451 respond_to_request(mdr, -EPERM);
8452 return;
8453 }
8454
8455 const string &snapname = req->get_filepath().last_dentry();
8456
8457 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8458 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8459 respond_to_request(mdr, -EPERM);
8460 return;
8461 }
8462
8463 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
8464
8465 // lock snap
8466 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8467
8468 mds->locker->include_snap_rdlocks(rdlocks, diri);
8469 rdlocks.erase(&diri->snaplock);
8470 xlocks.insert(&diri->snaplock);
8471
8472 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8473 return;
8474
8475 if (!check_access(mdr, diri, MAY_WRITE))
8476 return;
8477
8478 // make sure name is unique
8479 if (diri->snaprealm &&
8480 diri->snaprealm->exists(snapname)) {
8481 respond_to_request(mdr, -EEXIST);
8482 return;
8483 }
8484 if (snapname.length() == 0 ||
8485 snapname[0] == '_') {
8486 respond_to_request(mdr, -EINVAL);
8487 return;
8488 }
8489
8490 // allocate a snapid
8491 if (!mdr->more()->stid) {
8492 // prepare an stid
8493 mds->snapclient->prepare_create(diri->ino(), snapname,
8494 mdr->get_mds_stamp(),
8495 &mdr->more()->stid, &mdr->more()->snapidbl,
8496 new C_MDS_RetryRequest(mdcache, mdr));
8497 return;
8498 }
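// second pass: the snap table has reserved a transaction id and a snapid
// for us; decode them from the buffer prepare_create() filled in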
8499
8500 version_t stid = mdr->more()->stid;
8501 snapid_t snapid;
8502 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8503 ::decode(snapid, p);
8504 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
8505
8506 // journal
8507 SnapInfo info;
8508 info.ino = diri->ino();
8509 info.snapid = snapid;
8510 info.name = snapname;
8511 info.stamp = mdr->get_op_stamp();
8512
8513 inode_t *pi = diri->project_inode();
8514 pi->ctime = info.stamp;
8515 pi->version = diri->pre_dirty();
8516
8517 // project the snaprealm
8518 sr_t *newsnap = diri->project_snaprealm(snapid);
8519 newsnap->snaps[snapid] = info;
8520 newsnap->seq = snapid;
8521 newsnap->last_created = snapid;
8522
8523 // journal the inode changes
8524 mdr->ls = mdlog->get_current_segment();
8525 EUpdate *le = new EUpdate(mdlog, "mksnap");
8526 mdlog->start_entry(le);
8527
8528 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8529 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8530 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8531 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8532
8533 // journal the snaprealm changes
8534 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
8535 mdr, __func__);
8536 mdlog->flush();
8537 }
8538
8539 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
8540 {
8541 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
8542
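// if a snaprealm already existed this is a plain CREATE; otherwise the
// projected realm is brand new and clients holding caps under it must be
// split out of the parent realm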
8543 int op = (diri->snaprealm ? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
8544
8545 diri->pop_and_dirty_projected_inode(mdr->ls);
8546 mdr->apply();
8547
8548 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8549
8550 // the realm now includes the new snap
8551 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8552
8553 mdcache->do_realm_invalidate_and_update_notify(diri, op);
8554
8555 // yay
8556 mdr->in[0] = diri;
8557 mdr->snapid = info.snapid;
8558 mdr->tracei = diri;
8559 respond_to_request(mdr, 0);
8560 }
8561
8562
8563 // RMSNAP
8564
8565 struct C_MDS_rmsnap_finish : public ServerLogContext {
8566 CInode *diri;
8567 snapid_t snapid;
8568 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8569 ServerLogContext(s, r), diri(di), snapid(sn) {}
8570 void finish(int r) override {
8571 server->_rmsnap_finish(mdr, diri, snapid);
8572 }
8573 };
8574
8575 /* This function takes responsibility for the passed mdr. */
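// rmsnap mirrors mksnap: phase one reserves a snap-table transaction via
// prepare_destroy(), phase two journals the inode and snaprealm update,
// and _rmsnap_finish() commits the transaction and invalidates client
// realms.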
8576 void Server::handle_client_rmsnap(MDRequestRef& mdr)
8577 {
8578 MClientRequest *req = mdr->client_request;
8579
8580 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8581 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8582 respond_to_request(mdr, -ESTALE);
8583 return;
8584 }
8585 if (!diri->is_auth()) { // fw to auth?
8586 mdcache->request_forward(mdr, diri->authority().first);
8587 return;
8588 }
8589 if (!diri->is_dir()) {
8590 respond_to_request(mdr, -ENOTDIR);
8591 return;
8592 }
8593
8594 const string &snapname = req->get_filepath().last_dentry();
8595
8596 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8597 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8598 respond_to_request(mdr, -EPERM);
8599 return;
8600 }
8601
8602 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
8603
8604 // does snap exist?
8605 if (snapname.length() == 0 || snapname[0] == '_') {
8606 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
8607 return;
8608 }
8609 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
8610 respond_to_request(mdr, -ENOENT);
8611 return;
8612 }
8613 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
8614 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
8615
8616 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8617 mds->locker->include_snap_rdlocks(rdlocks, diri);
8618 rdlocks.erase(&diri->snaplock);
8619 xlocks.insert(&diri->snaplock);
8620
8621 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8622 return;
8623
8624 if (!check_access(mdr, diri, MAY_WRITE))
8625 return;
8626
8627 // prepare
8628 if (!mdr->more()->stid) {
8629 mds->snapclient->prepare_destroy(diri->ino(), snapid,
8630 &mdr->more()->stid, &mdr->more()->snapidbl,
8631 new C_MDS_RetryRequest(mdcache, mdr));
8632 return;
8633 }
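// second pass: decode the new realm seq that prepare_destroy() handed
// back along with the transaction id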
8634 version_t stid = mdr->more()->stid;
8635 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8636 snapid_t seq;
8637 ::decode(seq, p);
8638 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8639
8640 // journal
8641 inode_t *pi = diri->project_inode();
8642 pi->version = diri->pre_dirty();
8643 pi->ctime = mdr->get_op_stamp();
8644
8645 mdr->ls = mdlog->get_current_segment();
8646 EUpdate *le = new EUpdate(mdlog, "rmsnap");
8647 mdlog->start_entry(le);
8648
8649 // project the snaprealm
8650 sr_t *newnode = diri->project_snaprealm();
8651 newnode->snaps.erase(snapid);
8652 newnode->seq = seq;
8653 newnode->last_destroyed = seq;
8654
8655 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8656 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8657 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8658 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8659
8660 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
8661 mdr, __func__);
8662 mdlog->flush();
8663 }
8664
8665 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8666 {
8667 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
8668 version_t stid = mdr->more()->stid; // snap-table transaction id, not a snapid
8669 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8670 snapid_t seq;
8671 ::decode(seq, p);
8672
8673 diri->pop_and_dirty_projected_inode(mdr->ls);
8674 mdr->apply();
8675
8676 mds->snapclient->commit(stid, mdr->ls);
8677
8678 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8679
8680 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8681
8682 // yay
8683 mdr->in[0] = diri;
8684 respond_to_request(mdr, 0);
8685
8686 // purge snapshot data
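// (only safe when all past parents are open; with an incomplete snap set
// we could drop data for snapshots that still exist)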
8687 if (diri->snaprealm->have_past_parents_open())
8688 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8689 }
8690
8691 struct C_MDS_renamesnap_finish : public ServerLogContext {
8692 CInode *diri;
8693 snapid_t snapid;
8694 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8695 ServerLogContext(s, r), diri(di), snapid(sn) {}
8696 void finish(int r) override {
8697 server->_renamesnap_finish(mdr, diri, snapid);
8698 }
8699 };
8700
8701 /* This function takes responsibility for the passed mdr. */
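// renamesnap follows the same two-phase pattern through prepare_update():
// phase one reserves the snap-table transaction, phase two journals the
// projected snaprealm carrying the renamed entry, and _renamesnap_finish()
// commits and sends the realm update to clients.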
8702 void Server::handle_client_renamesnap(MDRequestRef& mdr)
8703 {
8704 MClientRequest *req = mdr->client_request;
8705 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
8706 respond_to_request(mdr, -EINVAL);
8707 return;
8708 }
8709
8710 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8711 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8712 respond_to_request(mdr, -ESTALE);
8713 return;
8714 }
8715
8716 if (!diri->is_auth()) { // fw to auth?
8717 mdcache->request_forward(mdr, diri->authority().first);
8718 return;
8719 }
8720
8721 if (!diri->is_dir()) { // dir only
8722 respond_to_request(mdr, -ENOTDIR);
8723 return;
8724 }
8725
8726 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
8727 mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8728 respond_to_request(mdr, -EPERM);
8729 return;
8730 }
8731
8732 const string &dstname = req->get_filepath().last_dentry();
8733 const string &srcname = req->get_filepath2().last_dentry();
8734 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
8735
8736 if (srcname.length() == 0 || srcname[0] == '_') {
8737 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
8738 return;
8739 }
8740 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
8741 respond_to_request(mdr, -ENOENT);
8742 return;
8743 }
8744 if (dstname.length() == 0 || dstname[0] == '_') {
8745 respond_to_request(mdr, -EINVAL);
8746 return;
8747 }
8748 if (diri->snaprealm->exists(dstname)) {
8749 respond_to_request(mdr, -EEXIST);
8750 return;
8751 }
8752
8753 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
8754 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
8755
8756 // lock snap
8757 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8758
8759 mds->locker->include_snap_rdlocks(rdlocks, diri);
8760 rdlocks.erase(&diri->snaplock);
8761 xlocks.insert(&diri->snaplock);
8762
8763 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8764 return;
8765
8766 if (!check_access(mdr, diri, MAY_WRITE))
8767 return;
8768
8769 // prepare
8770 if (!mdr->more()->stid) {
8771 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
8772 &mdr->more()->stid, &mdr->more()->snapidbl,
8773 new C_MDS_RetryRequest(mdcache, mdr));
8774 return;
8775 }
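// second pass: decode the updated realm seq returned by prepare_update()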
8776
8777 version_t stid = mdr->more()->stid;
8778 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8779 snapid_t seq;
8780 ::decode(seq, p);
8781 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8782
8783 // journal
8784 inode_t *pi = diri->project_inode();
8785 pi->ctime = mdr->get_op_stamp();
8786 pi->version = diri->pre_dirty();
8787
8788 // project the snaprealm
8789 sr_t *newsnap = diri->project_snaprealm();
8790 assert(newsnap->snaps.count(snapid));
8791 newsnap->snaps[snapid].name = dstname;
8792
8793 // journal the inode changes
8794 mdr->ls = mdlog->get_current_segment();
8795 EUpdate *le = new EUpdate(mdlog, "renamesnap");
8796 mdlog->start_entry(le);
8797
8798 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8799 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8800 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8801 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8802
8803 // journal the snaprealm changes
8804 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
8805 mdr, __func__);
8806 mdlog->flush();
8807 }
8808
8809 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8810 {
8811 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
8812
8813 diri->pop_and_dirty_projected_inode(mdr->ls);
8814 mdr->apply();
8815
8816 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8817
8818 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8819
8820 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
8821
8822 // yay
8823 mdr->in[0] = diri;
8824 mdr->tracei = diri;
8825 mdr->snapid = snapid;
8826 respond_to_request(mdr, 0);
8827 }
8828
8829 /**
8830 * Return true if server is in state RECONNECT and this
8831 * client has not yet reconnected.
8832 */
8833 bool Server::waiting_for_reconnect(client_t c) const
8834 {
8835 return client_reconnect_gather.count(c) > 0;
8836 }
8837
8838 void Server::dump_reconnect_status(Formatter *f) const
8839 {
8840 f->open_object_section("reconnect_status");
8841 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
8842 f->close_section();
8843 }