// ceph/src/mds/Server.cc
// (source obtained from the git.proxmox.com gitweb view of ceph.git,
//  "add subtree-ish sources for 12.0.3")
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20
21 #include "MDSRank.h"
22 #include "Server.h"
23 #include "Locker.h"
24 #include "MDCache.h"
25 #include "MDLog.h"
26 #include "Migrator.h"
27 #include "MDBalancer.h"
28 #include "InoTable.h"
29 #include "SnapClient.h"
30 #include "Mutation.h"
31
32 #include "msg/Messenger.h"
33
34 #include "osdc/Objecter.h"
35
36 #include "messages/MClientSession.h"
37 #include "messages/MClientRequest.h"
38 #include "messages/MClientReply.h"
39 #include "messages/MClientReconnect.h"
40 #include "messages/MClientCaps.h"
41 #include "messages/MClientSnap.h"
42
43 #include "messages/MMDSSlaveRequest.h"
44
45 #include "messages/MLock.h"
46
47 #include "events/EUpdate.h"
48 #include "events/ESlaveUpdate.h"
49 #include "events/ESession.h"
50 #include "events/EOpen.h"
51 #include "events/ECommitted.h"
52
53 #include "include/filepath.h"
54 #include "common/errno.h"
55 #include "common/Timer.h"
56 #include "common/perf_counters.h"
57 #include "include/compat.h"
58 #include "osd/OSDMap.h"
59
60 #include <errno.h>
61
62 #include <list>
63 #include <iostream>
64 using namespace std;
65
66 #include "common/config.h"
67
68 #define dout_context g_ceph_context
69 #define dout_subsys ceph_subsys_mds
70 #undef dout_prefix
71 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
72
73
74 class ServerContext : public MDSInternalContextBase {
75 protected:
76 Server *server;
77 MDSRank *get_mds() override
78 {
79 return server->mds;
80 }
81
82 public:
83 explicit ServerContext(Server *s) : server(s) {
84 assert(server != NULL);
85 }
86 };
87
88 class ServerLogContext : public MDSLogContextBase {
89 protected:
90 Server *server;
91 MDSRank *get_mds() override
92 {
93 return server->mds;
94 }
95
96 MDRequestRef mdr;
97 void pre_finish(int r) override {
98 if (mdr)
99 mdr->mark_event("journal_committed: ");
100 }
101 public:
102 explicit ServerLogContext(Server *s) : server(s) {
103 assert(server != NULL);
104 }
105 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
106 assert(server != NULL);
107 }
108 };
109
/* Register the Server's perf counters ("mds_server" logger) with the global
 * perf-counter collection.  Called once during MDS startup; the rest of the
 * Server only ticks these counters when 'logger' is non-NULL. */
void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
  // Message-level counters; the short nicks ("hcr" etc.) show up in
  // `ceph daemonperf` output.
  plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
      "Client requests", "hcr");
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
      "Slave requests", "hsr");
  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
      "Client session messages", "hcs");
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
  // NOTE(review): the exported name "dispatch_server_request" does not match
  // the enum l_mdss_dispatch_slave_request; renaming the counter would break
  // existing monitoring dashboards, so the inconsistency is left as-is.
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
  // One counter per client request opcode.
  plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
      "Request type lookup hash of inode");
  plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
      "Request type lookup inode");
  plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
      "Request type lookup parent");
  plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
      "Request type lookup name");
  plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
      "Request type lookup");
  plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
      "Request type lookup snapshot");
  plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
      "Request type get attribute");
  plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
      "Request type set attribute");
  plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
      "Request type set file layout");
  plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
      "Request type set directory layout");
  plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
      "Request type set extended attribute");
  plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
      "Request type remove extended attribute");
  plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
      "Request type read directory");
  plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
      "Request type set file lock");
  plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
      "Request type get file lock");
  plb.add_u64_counter(l_mdss_req_create, "req_create",
      "Request type create");
  plb.add_u64_counter(l_mdss_req_open, "req_open",
      "Request type open");
  plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
      "Request type make node");
  plb.add_u64_counter(l_mdss_req_link, "req_link",
      "Request type link");
  plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
      "Request type unlink");
  plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
      "Request type remove directory");
  plb.add_u64_counter(l_mdss_req_rename, "req_rename",
      "Request type rename");
  plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
      "Request type make directory");
  plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
      "Request type symbolic link");
  plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
      "Request type list snapshot");
  plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
      "Request type make snapshot");
  plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
      "Request type remove snapshot");
  plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
      "Request type rename snapshot");
  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}
180
/* Construct the Server for an MDS rank.  Everything starts "idle": no perf
 * counters yet (create_logger() installs them later), OSD map not full, no
 * reconnect phase in progress, no session termination in progress. */
Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  terminating_sessions(false)
{
}
191
192
/* Top-level dispatcher for messages handed to the Server.
 *
 * This function DOES put the passed message before returning (either
 * directly, or via the handler it delegates to). */
void Server::dispatch(Message *m)
{
  // Client reconnects are handled regardless of MDS state.
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(static_cast<MClientReconnect*>(m));
    return;
  }

  // active?
  if (!mds->is_active() &&
      !(mds->is_stopping() && m->get_source().is_mds())) {
    // While in (or heading into) reconnect, replayed and already-completed
    // client requests are queued for the clientreplay stage instead of
    // being deferred until active.
    if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
	(mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
      MClientRequest *req = static_cast<MClientRequest*>(m);
      Session *session = get_session(req);
      if (!session || session->is_closed()) {
	dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
	req->put();
	return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
	dout(3) << "queuing replayed op" << dendl;
	queue_replay = true;
      } else if (req->get_retry_attempt()) {
	// process completed request in clientreplay stage. The completed request
	// might have created new file/directorie. This guarantees MDS sends a reply
	// to client before other request modifies the new file/directorie.
	if (session->have_completed_request(req->get_reqid().tid, NULL)) {
	  dout(3) << "queuing completed op" << dendl;
	  queue_replay = true;
	}
	// this request was created before the cap reconnect message, drop any embedded
	// cap releases.
	req->releases.clear();
      }
      if (queue_replay) {
	req->mark_queued_for_replay();
	mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
	return;
      }
    }

    // Decide whether this message must wait for the active state, or may
    // be processed early (slave requests, session opens and queued-replay
    // requests during clientreplay).
    bool wait_for_active = true;
    if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
      // handle_slave_request() will wait if necessary
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      // session open requests need to be handled during replay,
      // close requests need to be delayed
      if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
	  (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
	wait_for_active = false;
      } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
	MClientRequest *req = static_cast<MClientRequest*>(m);
	if (req->is_queued_for_replay()) {
	  wait_for_active = false;
	}
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  // Route to the per-type handler; each handler consumes the message.
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(static_cast<MClientRequest*>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    assert(0 == "server unknown message");
  }
}
276
277
278
279 // ----------------------------------------------------------
280 // SESSION management
281
// Journal completion for a session open/close (ESession) entry.  Once the
// entry is safe, applies the projected session state change (and, for a
// close, the projected inotable release) via Server::_session_logged(),
// then completes the optional waiter context.
class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;  // session state seq that was projected
  bool open;           // true = opening, false = closing/killing
  version_t cmapv;     // projected sessionmap version
  interval_set<inodeno_t> inos;  // preallocated inos being released (close only)
  version_t inotablev; // projected inotable version (0 if no release)
  Context *fin;        // optional waiter, completed after _session_logged()
public:
  // Open variant: no ino release.
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  // Close variant: also carries the ino release and inotable version.
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};
303
304 Session *Server::get_session(Message *m)
305 {
306 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
307 if (session) {
308 dout(20) << "get_session have " << session << " " << session->info.inst
309 << " state " << session->get_state_name() << dendl;
310 session->put(); // not carry ref
311 } else {
312 dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
313 }
314 return session;
315 }
316
/* Handle a CEPH_MSG_CLIENT_SESSION message: open, renewcaps, close, or
 * flush-ack.  Opens and closes are journaled (ESession) and take effect in
 * _session_logged(); renewcaps and flush-acks are applied immediately.
 *
 * This function DOES put the passed message before returning. */
void Server::handle_client_session(MClientSession *m)
{
  version_t pv;
  Session *session = get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    m->put();
    return;
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    // Only a closed/closing session may (re)open; anything else is a
    // duplicate request.
    if (session->is_opening() ||
	session->is_open() ||
	session->is_stale() ||
	session->is_killing()) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      m->put();
      return;
    }
    assert(session->is_closed() ||
	   session->is_closing());

    session->set_client_metadata(m->client_meta);
    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
	     << session->info.client_metadata.size() << " metadata entries:" << dendl;
    for (map<string, string>::iterator i = session->info.client_metadata.begin();
	 i != session->info.client_metadata.end(); ++i) {
      dout(20) << " " << i->first << ": " << i->second << dendl;
    }

    // Special case for the 'root' metadata path; validate that the claimed
    // root is actually within the caps of the session
    if (session->info.client_metadata.count("root")) {
      const auto claimed_root = session->info.client_metadata.at("root");
      // claimed_root has a leading "/" which we strip before passing
      // into caps check
      if (claimed_root.empty() || claimed_root[0] != '/' ||
	  !session->auth_caps.path_capable(claimed_root.substr(1))) {
	derr << __func__ << " forbidden path claimed as mount root: "
	     << claimed_root << " by " << m->get_source() << dendl;
	// Tell the client we're rejecting their open
	mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
	mds->clog->warn() << "client session with invalid root '" <<
	  claimed_root << "' denied (" << session->info.inst << ")";
	session->clear();
	// Drop out; don't record this session in SessionMap or journal it.
	break;
      }
    }

    if (session->is_closed())
      mds->sessionmap.add_session(session);

    // Project OPENING and journal the ESession; the state becomes final in
    // _session_logged() once the entry is safe.
    pv = mds->sessionmap.mark_projected(session);
    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    mds->sessionmap.touch_session(session);
    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
			      new C_MDS_session_finish(this, session, sseq, true, pv));
    mdlog->flush();
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() ||
	session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
	// A renew on a stale session revives it and its caps.
	mds->sessionmap.set_state(session, Session::STATE_OPEN);
	mds->locker->resume_stale_caps(session);
	mds->sessionmap.touch_session(session);
      }
      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
	  session->is_closing() ||
	  session->is_killing()) {
	dout(10) << "already closed|closing|killing, dropping this req" << dendl;
	m->put();
	return;
      }
      if (session->is_importing()) {
	dout(10) << "ignoring close req on importing session" << dendl;
	m->put();
	return;
      }
      assert(session->is_open() ||
	     session->is_stale() ||
	     session->is_opening());
      // Stale close: client hasn't seen all the messages we pushed.
      if (m->get_seq() < session->get_push_seq()) {
	dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
		 << ", dropping" << dendl;
	m->put();
	return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seqn error.
      //
      if (m->get_seq() != session->get_push_seq()) {
	dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
		<< ", BUGGY!" << dendl;
	mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
			  << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
	m->put();
	return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  default:
    ceph_abort();
  }
  m->put();
}
450
451 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
452 {
453 for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
454 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
455 assert(session);
456 if (!session->is_open() ||
457 !session->connection.get() ||
458 !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
459 continue;
460 version_t seq = session->wait_for_flush(gather.new_sub());
461 mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
462 }
463 }
464
465 void Server::finish_flush_session(Session *session, version_t seq)
466 {
467 list<MDSInternalContextBase*> finished;
468 session->finish_flush(seq, finished);
469 mds->queue_waiters(finished);
470 }
471
/* Journal-commit callback for ESession entries (invoked from
 * C_MDS_session_finish::finish).  Applies the open/close that was journaled:
 * releases any projected preallocated inos, then either finishes opening the
 * session or tears it down (caps, leases, reconnect-gather membership).
 * No-ops if the session's state_seq changed while the entry was in flight. */
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
			     interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
	   << " " << pv << dendl;

  if (piv) {
    // A close projected an inotable release; apply it now.
    assert(session->is_closing() || session->is_killing() ||
	   session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
	     << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    assert(session->connection != NULL);
    session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
    if (mdcache->is_readonly())
      session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
  } else if (session->is_closing() ||
	     session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, session->info.inst.name.num());
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    // A dying client no longer needs to be waited on for reconnect.
    if (client_reconnect_gather.count(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      client_reconnect_gather.erase(session->info.get_client());

      if (client_reconnect_gather.empty()) {
	dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
	reconnect_gather_finish();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->connection != NULL) {
	// Conditional because terminate_sessions will indiscrimately
	// put sessions in CLOSING whether they ever had a conn or not.
	session->connection->mark_disposable();
      }

      // reset session
      mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->connection != NULL) {
	session->connection->mark_down();
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}
557
558 /**
559 * Inject sessions from some source other than actual connections.
560 *
561 * For example:
562 * - sessions inferred from journal replay
563 * - sessions learned from other MDSs during rejoin
564 * - sessions learned from other MDSs during dir/caps migration
565 * - sessions learned from other MDSs during a cross-MDS rename
566 */
567 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
568 map<client_t,uint64_t>& sseqmap)
569 {
570 version_t pv = mds->sessionmap.get_projected();
571
572 dout(10) << "prepare_force_open_sessions " << pv
573 << " on " << cm.size() << " clients"
574 << dendl;
575 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
576
577 Session *session = mds->sessionmap.get_or_add_session(p->second);
578 pv = mds->sessionmap.mark_projected(session);
579 if (session->is_closed() ||
580 session->is_closing() ||
581 session->is_killing())
582 sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
583 else
584 assert(session->is_open() ||
585 session->is_opening() ||
586 session->is_stale());
587 session->inc_importing();
588 }
589 return pv;
590 }
591
592 void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
593 map<client_t,uint64_t>& sseqmap,
594 bool dec_import)
595 {
596 /*
597 * FIXME: need to carefully consider the race conditions between a
598 * client trying to close a session and an MDS doing an import
599 * trying to force open a session...
600 */
601 dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
602 << " initial v " << mds->sessionmap.get_version() << dendl;
603
604
605 int sessions_inserted = 0;
606 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
607 sessions_inserted++;
608
609 Session *session = mds->sessionmap.get_session(p->second.name);
610 assert(session);
611
612 if (sseqmap.count(p->first)) {
613 uint64_t sseq = sseqmap[p->first];
614 if (session->get_state_seq() != sseq) {
615 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
616 } else {
617 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
618 mds->sessionmap.set_state(session, Session::STATE_OPEN);
619 mds->sessionmap.touch_session(session);
620 mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
621 if (mdcache->is_readonly())
622 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
623 }
624 } else {
625 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
626 assert(session->is_open() || session->is_stale());
627 }
628
629 if (dec_import) {
630 session->dec_importing();
631 }
632
633 mds->sessionmap.mark_dirty(session);
634 }
635
636 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
637 }
638
639 class C_MDS_TerminatedSessions : public ServerContext {
640 void finish(int r) override {
641 server->terminating_sessions = false;
642 }
643 public:
644 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
645 };
646
647 void Server::terminate_sessions()
648 {
649 dout(2) << "terminate_sessions" << dendl;
650
651 terminating_sessions = true;
652
653 // kill them off. clients will retry etc.
654 set<Session*> sessions;
655 mds->sessionmap.get_client_session_set(sessions);
656 for (set<Session*>::const_iterator p = sessions.begin();
657 p != sessions.end();
658 ++p) {
659 Session *session = *p;
660 if (session->is_closing() ||
661 session->is_killing() ||
662 session->is_closed())
663 continue;
664 journal_close_session(session, Session::STATE_CLOSING, NULL);
665 }
666
667 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
668 }
669
670
/* Periodic scan (from MDS tick) for idle client sessions.  Two passes:
 *  1) sessions whose last cap renew is older than mds_session_timeout are
 *     marked STALE (caps revoked, leases dropped, client notified);
 *  2) STALE sessions older than mds_session_autoclose are killed, unless
 *     we've been laggy (to avoid evicting clients for our own slowness) or
 *     there is only one client on a single-MDS system. */
void Server::find_idle_sessions()
{
  dout(10) << "find_idle_sessions.  laggy until " << mds->get_laggy_until() << dendl;

  // timeout/stale
  //  (caps go stale, lease die)
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_session_timeout;
  while (1) {
    // Sessions are kept in LRU order, so the oldest OPEN session being
    // fresh enough means all of them are.
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
    if (!session) break;
    dout(20) << "laggiest active session is " << session->info.inst << dendl;
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
	       << session->last_cap_renew << ")" << dendl;
      break;
    }

    dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
    mds->sessionmap.set_state(session, Session::STATE_STALE);
    mds->locker->revoke_stale_caps(session);
    mds->locker->remove_stale_leases(session);
    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
    finish_flush_session(session, session->get_push_seq());
  }

  // autoclose
  cutoff = now;
  cutoff -= g_conf->mds_session_autoclose;

  // don't kick clients if we've been laggy
  if (mds->get_laggy_until() > cutoff) {
    dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
	     << ", not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 &&
      mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "not evicting a slow client, because there is only one"
             << dendl;
    return;
  }

  while (1) {
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_STALE);
    if (!session)
      break;
    if (session->is_importing()) {
      // Mid-migration sessions must not be killed; stop the scan here.
      dout(10) << "stopping at importing session " << session->info.inst << dendl;
      break;
    }
    assert(session->is_stale());
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
	       << session->last_cap_renew << ")" << dendl;
      break;
    }

    utime_t age = now;
    age -= session->last_cap_renew;
    mds->clog->info() << "closing stale session " << session->info.inst
	<< " after " << age;
    dout(10) << "autoclosing stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
    kill_session(session, NULL);
  }
}
739
740 /*
741 * XXX bump in the interface here, not using an MDSInternalContextBase here
742 * because all the callers right now happen to use a SaferCond
743 */
744 void Server::kill_session(Session *session, Context *on_safe)
745 {
746 if ((session->is_opening() ||
747 session->is_open() ||
748 session->is_stale()) &&
749 !session->is_importing()) {
750 dout(10) << "kill_session " << session << dendl;
751 journal_close_session(session, Session::STATE_KILLING, on_safe);
752 } else {
753 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
754 assert(session->is_closing() ||
755 session->is_closed() ||
756 session->is_killing() ||
757 session->is_importing());
758 if (on_safe) {
759 on_safe->complete(0);
760 }
761 }
762 }
763
/* Move a session into @state (CLOSING or KILLING) and journal the close.
 * Projects the release of all preallocated and pending-preallocated inos in
 * the inotable, submits an ESession close entry whose commit callback
 * (C_MDS_session_finish -> _session_logged) applies everything, then kills
 * the session's in-flight requests and wakes any flush waiters.
 * @on_safe, if non-NULL, is completed after the journal entry is applied. */
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
			    new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
					  item_session_request));
  while (!p.end()) {
    // Advance before killing: request_kill unlinks the request from the
    // session's list.
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}
797
/* Begin the reconnect phase: snapshot the set of clients we expect a
 * MClientReconnect from into client_reconnect_gather.  @reconnect_done_ is
 * completed by reconnect_gather_finish() once every client has reconnected
 * (or the phase otherwise ends).  If there are no sessions at all, finish
 * immediately. */
void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;
  mds->sessionmap.get_client_set(client_reconnect_gather);

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the
  // monitor.

  reconnect_start = ceph_clock_now();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}
815
/* Handle a client's MClientReconnect during the reconnect phase: validate
 * the attempt (reject if we left reconnect, the session is closed, or the
 * FS is read-only), then replay the client's claimed snaprealms and caps
 * into the cache/rejoin structures, and remove the client from the gather
 * set (finishing the phase if it was the last one).
 *
 * This function DOES put the passed message before returning. */
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = get_session(m);
  assert(session);

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  // Decide whether this reconnect must be denied.
  bool deny = false;
  if (!mds->is_reconnect()) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
       << ceph_mds_state_name(mds->get_state())
       << ") from " << m->get_source_inst()
       << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (session->is_closed()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
	<< ceph_mds_state_name(mds->get_state())
	<< ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = ceph_clock_now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
	dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
	mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
	// Past parents not loaded yet; defer until rejoin.
	dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
	mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
	       << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours.  take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
	       << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_export_caps(p->first, from, p->second,
				  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}
932
933
934
935 void Server::reconnect_gather_finish()
936 {
937 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
938 assert(reconnect_done);
939 reconnect_done->complete(0);
940 reconnect_done = NULL;
941 }
942
943 void Server::reconnect_tick()
944 {
945 utime_t reconnect_end = reconnect_start;
946 reconnect_end += g_conf->mds_reconnect_timeout;
947 if (ceph_clock_now() >= reconnect_end &&
948 !client_reconnect_gather.empty()) {
949 dout(10) << "reconnect timed out" << dendl;
950 for (set<client_t>::iterator p = client_reconnect_gather.begin();
951 p != client_reconnect_gather.end();
952 ++p) {
953 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
954 assert(session);
955 dout(1) << "reconnect gave up on " << session->info.inst << dendl;
956 kill_session(session, NULL);
957 failed_reconnects++;
958 }
959 client_reconnect_gather.clear();
960 reconnect_gather_finish();
961 }
962 }
963
964 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
965 {
966 if (!locks.length()) return;
967 int numlocks;
968 ceph_filelock lock;
969 bufferlist::iterator p = locks.begin();
970 ::decode(numlocks, p);
971 for (int i = 0; i < numlocks; ++i) {
972 ::decode(lock, p);
973 lock.client = client;
974 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
975 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
976 }
977 ::decode(numlocks, p);
978 for (int i = 0; i < numlocks; ++i) {
979 ::decode(lock, p);
980 lock.client = client;
981 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
982 ++in->get_flock_lock_state()->client_held_lock_counts[client];
983 }
984 }
985
986
987 /**
988 * Call this when the MDCache is oversized, to send requests to the clients
989 * to trim some caps, and consequently unpin some inodes in the MDCache so
990 * that it can trim too.
991 */
992 void Server::recall_client_state(float ratio)
993 {
994 int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
995 int min_caps_per_client = 100;
996
997 dout(10) << "recall_client_state " << ratio
998 << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
999 << dendl;
1000
1001 set<Session*> sessions;
1002 mds->sessionmap.get_client_session_set(sessions);
1003 for (set<Session*>::const_iterator p = sessions.begin();
1004 p != sessions.end();
1005 ++p) {
1006 Session *session = *p;
1007 if (!session->is_open() ||
1008 !session->info.inst.name.is_client())
1009 continue;
1010
1011 dout(10) << " session " << session->info.inst
1012 << " caps " << session->caps.size()
1013 << ", leases " << session->leases.size()
1014 << dendl;
1015
1016 if (session->caps.size() > min_caps_per_client) {
1017 int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
1018 if (session->caps.size() > newlim) {
1019 MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
1020 m->head.max_caps = newlim;
1021 mds->send_message_client(m, session);
1022 session->notify_recall_sent(newlim);
1023 }
1024 }
1025 }
1026 }
1027
1028 void Server::force_clients_readonly()
1029 {
1030 dout(10) << "force_clients_readonly" << dendl;
1031 set<Session*> sessions;
1032 mds->sessionmap.get_client_session_set(sessions);
1033 for (set<Session*>::const_iterator p = sessions.begin();
1034 p != sessions.end();
1035 ++p) {
1036 Session *session = *p;
1037 if (!session->info.inst.name.is_client() ||
1038 !(session->is_open() || session->is_stale()))
1039 continue;
1040 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
1041 }
1042 }
1043
1044 /*******
1045 * some generic stuff for finishing off requests
1046 */
// Pin the trace targets on the request, optionally send an early (unsafe)
// reply, then submit the journal entry.  'fin' runs when the entry is
// safely on disk and sends/validates the final reply.
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  // reply before the journal entry commits, when permitted; the reply is
  // flagged unsafe in that case (see early_reply for the conditions)
  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    // client already has its answer; release rdlocks while the commit
    // is in flight
    mds->locker->drop_rdlocks(mdr.get());
  else
    // no early reply was sent, so flush the log to get the reply out sooner
    mdlog->flush();
}
1078
1079 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1080 const char *event)
1081 {
1082 if (mdr) {
1083 string event_str("submit entry: ");
1084 event_str += event;
1085 mdr->mark_event_string(event_str);
1086 }
1087 mdlog->submit_entry(le, fin);
1088 }
1089
1090 /*
1091 * send response built from mdr contents and error code; clean up mdr
1092 */
// Reply to a client request (building the MClientReply here) or complete
// an internal op via its finisher, then bump the matching per-op perf
// counter exactly once per request.
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));

    // add here to avoid counting ops multiple times (e.g., locks, loading)
    switch(mdr->client_request->get_op()) {
    case CEPH_MDS_OP_LOOKUPHASH:
      logger->inc(l_mdss_req_lookuphash);
      break;
    case CEPH_MDS_OP_LOOKUPINO:
      logger->inc(l_mdss_req_lookupino);
      break;
    case CEPH_MDS_OP_LOOKUPPARENT:
      logger->inc(l_mdss_req_lookupparent);
      break;
    case CEPH_MDS_OP_LOOKUPNAME:
      logger->inc(l_mdss_req_lookupname);
      break;
    case CEPH_MDS_OP_LOOKUP:
      logger->inc(l_mdss_req_lookup);
      break;
    case CEPH_MDS_OP_LOOKUPSNAP:
      logger->inc(l_mdss_req_lookupsnap);
      break;
    case CEPH_MDS_OP_GETATTR:
      logger->inc(l_mdss_req_getattr);
      break;
    case CEPH_MDS_OP_SETATTR:
      logger->inc(l_mdss_req_setattr);
      break;
    case CEPH_MDS_OP_SETLAYOUT:
      logger->inc(l_mdss_req_setlayout);
      break;
    case CEPH_MDS_OP_SETDIRLAYOUT:
      logger->inc(l_mdss_req_setdirlayout);
      break;
    case CEPH_MDS_OP_SETXATTR:
      logger->inc(l_mdss_req_setxattr);
      break;
    case CEPH_MDS_OP_RMXATTR:
      logger->inc(l_mdss_req_rmxattr);
      break;
    case CEPH_MDS_OP_READDIR:
      logger->inc(l_mdss_req_readdir);
      break;
    case CEPH_MDS_OP_SETFILELOCK:
      logger->inc(l_mdss_req_setfilelock);
      break;
    case CEPH_MDS_OP_GETFILELOCK:
      logger->inc(l_mdss_req_getfilelock);
      break;
    case CEPH_MDS_OP_CREATE:
      logger->inc(l_mdss_req_create);
      // NOTE(review): no break — CREATE falls through and is also counted
      // as OPEN.  Presumably intentional (dispatch treats a completed
      // CREATE as an open), but confirm before adding a break.
    case CEPH_MDS_OP_OPEN:
      logger->inc(l_mdss_req_open);
      break;
    case CEPH_MDS_OP_MKNOD:
      logger->inc(l_mdss_req_mknod);
      break;
    case CEPH_MDS_OP_LINK:
      logger->inc(l_mdss_req_link);
      break;
    case CEPH_MDS_OP_UNLINK:
      logger->inc(l_mdss_req_unlink);
      break;
    case CEPH_MDS_OP_RMDIR:
      logger->inc(l_mdss_req_rmdir);
      break;
    case CEPH_MDS_OP_RENAME:
      logger->inc(l_mdss_req_rename);
      break;
    case CEPH_MDS_OP_MKDIR:
      logger->inc(l_mdss_req_mkdir);
      break;
    case CEPH_MDS_OP_SYMLINK:
      logger->inc(l_mdss_req_symlink);
      break;
    case CEPH_MDS_OP_LSSNAP:
      logger->inc(l_mdss_req_lssnap);
      break;
    case CEPH_MDS_OP_MKSNAP:
      logger->inc(l_mdss_req_mksnap);
      break;
    case CEPH_MDS_OP_RMSNAP:
      logger->inc(l_mdss_req_rmsnap);
      break;
    case CEPH_MDS_OP_RENAMESNAP:
      logger->inc(l_mdss_req_renamesnap);
      break;
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}
1192
// Send an "unsafe" reply to the client before the journal entry commits.
// Skipped when early replies are disabled, when slaves have journaled on
// this request, when an inode was allocated, for MDS-originated requests,
// and during replay.  Sets mdr->did_early_reply so the final reply path
// knows the client already has its answer.
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf->mds_early_reply)
    return;

  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  MClientRequest *req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  // requests forwarded from another MDS get no client-style reply
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  MClientReply *reply = new MClientReply(req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
	   << " (" << cpp_strerror(reply->get_result())
	   << ") " << *req << dendl;

  if (tracei || tracedn) {
    // drop any pending cap releases on the trace targets before we encode
    // them into the reply
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
		   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  req->get_connection()->send_message(reply);

  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
1255
1256 /*
1257 * send given reply
1258 * include a trace to tracei
1259 * Clean up mdr
1260 */
1261 void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
1262 {
1263 assert(mdr.get());
1264 MClientRequest *req = mdr->client_request;
1265
1266 dout(7) << "reply_client_request " << reply->get_result()
1267 << " (" << cpp_strerror(reply->get_result())
1268 << ") " << *req << dendl;
1269
1270 mdr->mark_event("replying");
1271
1272 Session *session = mdr->session;
1273
1274 // note successful request in session map?
1275 //
1276 // setfilelock requests are special, they only modify states in MDS memory.
1277 // The states get lost when MDS fails. If Client re-send a completed
1278 // setfilelock request, it means that client did not receive corresponding
1279 // setfilelock reply. So MDS should re-execute the setfilelock request.
1280 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1281 reply->get_result() == 0 && session) {
1282 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1283 session->add_completed_request(mdr->reqid.tid, created);
1284 if (mdr->ls) {
1285 mdr->ls->touched_sessions.insert(session->info.inst.name);
1286 }
1287 }
1288
1289 // give any preallocated inos to the session
1290 apply_allocated_inos(mdr, session);
1291
1292 // get tracei/tracedn from mdr?
1293 snapid_t snapid = mdr->snapid;
1294 CInode *tracei = mdr->tracei;
1295 CDentry *tracedn = mdr->tracedn;
1296
1297 bool is_replay = mdr->client_request->is_replay();
1298 bool did_early_reply = mdr->did_early_reply;
1299 entity_inst_t client_inst = req->get_source_inst();
1300 int dentry_wanted = req->get_dentry_wanted();
1301
1302 if (!did_early_reply && !is_replay) {
1303
1304 mds->logger->inc(l_mds_reply);
1305 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1306 mds->logger->tinc(l_mds_reply_latency, lat);
1307 dout(20) << "lat " << lat << dendl;
1308
1309 if (tracei)
1310 mdr->cap_releases.erase(tracei->vino());
1311 if (tracedn)
1312 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1313 }
1314
1315 // drop non-rdlocks before replying, so that we can issue leases
1316 mdcache->request_drop_non_rdlocks(mdr);
1317
1318 // reply at all?
1319 if (client_inst.name.is_mds() || !session) {
1320 reply->put(); // mds doesn't need a reply
1321 reply = 0;
1322 } else {
1323 // send reply.
1324 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1325 (tracei || tracedn)) {
1326 if (is_replay) {
1327 if (tracei)
1328 mdcache->try_reconnect_cap(tracei, session);
1329 } else {
1330 // include metadata in reply
1331 set_trace_dist(session, reply, tracei, tracedn,
1332 snapid, dentry_wanted,
1333 mdr);
1334 }
1335 }
1336
1337 // We can set the extra bl unconditionally: if it's already been sent in the
1338 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1339 reply->set_extra_bl(mdr->reply_extra_bl);
1340
1341 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
1342 req->get_connection()->send_message(reply);
1343 }
1344
1345 if (req->is_queued_for_replay() &&
1346 (mdr->has_completed || reply->get_result() < 0)) {
1347 if (reply->get_result() < 0) {
1348 int r = reply->get_result();
1349 derr << "reply_client_request: failed to replay " << *req
1350 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
1351 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
1352 }
1353 mds->queue_one_replay();
1354 }
1355
1356 // clean up request
1357 mdcache->request_finish(mdr);
1358
1359 // take a closer look at tracei, if it happens to be a remote link
1360 if (tracei &&
1361 tracedn &&
1362 tracedn->get_projected_linkage()->is_remote()) {
1363 mdcache->eval_remote(tracedn);
1364 }
1365 }
1366
1367
1368 void Server::encode_empty_dirstat(bufferlist& bl)
1369 {
1370 static DirStat empty;
1371 empty.encode(bl);
1372 }
1373
1374 void Server::encode_infinite_lease(bufferlist& bl)
1375 {
1376 LeaseStat e;
1377 e.seq = 0;
1378 e.mask = -1;
1379 e.duration_ms = -1;
1380 ::encode(e, bl);
1381 dout(20) << "encode_infinite_lease " << e << dendl;
1382 }
1383
1384 void Server::encode_null_lease(bufferlist& bl)
1385 {
1386 LeaseStat e;
1387 e.seq = 0;
1388 e.mask = 0;
1389 e.duration_ms = 0;
1390 ::encode(e, bl);
1391 dout(20) << "encode_null_lease " << e << dendl;
1392 }
1393
1394
1395 /*
1396 * pass inode OR dentry (not both, or we may get confused)
1397 *
1398 * trace is in reverse order (i.e. root inode comes last)
1399 */
// Encode the reply trace (snap realm blob, then dentry/dir info, then the
// target inode) into 'reply'.  The encode order here is the wire format
// the client decodes — do not reorder.
void Server::set_trace_dist(Session *session, MClientReply *reply,
			    CInode *in, CDentry *dn,
			    snapid_t snapid,
			    int dentry_wanted,
			    MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted);  // not true for snapshot lookups

  // realm
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    dir->encode_dirstat(bl, whoami);
    dout(20) << "set_trace_dist added dir  " << *dir << dendl;

    ::encode(dn->get_name(), bl);
    // a real lease only makes sense on the head version; snapshots are
    // immutable, so a null lease suffices
    if (snapid == CEPH_NOSNAP)
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else
      encode_null_lease(bl);
    dout(20) << "set_trace_dist added dn   " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in   " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
1470
1471
1472
1473
1474 /***
1475 * process a client request
1476 * This function DOES put the passed message before returning
1477 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  // can't do anything until the root inode is loaded; retry when it is
  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if (session->is_closed() ||
	       session->is_closing() ||
	       session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      // still advance the replay queue so replay doesn't stall on a
      // dropped request
      if (req->is_queued_for_replay())
	mds->queue_one_replay();
      req->put();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it? hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      // Don't send traceless reply if the completed request has created
      // new inode. Treat the request as lookup request instead.
      if (req->is_replay() ||
	  ((created == inodeno_t() || !mds->is_clientreplay()) &&
	   req->get_op() != CEPH_MDS_OP_OPEN &&
	   req->get_op() != CEPH_MDS_OP_CREATE)) {
	dout(5) << "already completed " << req->get_reqid() << dendl;
	MClientReply *reply = new MClientReply(req, 0);
	// hand the created ino back so the client can match it up
	if (created != inodeno_t()) {
	  bufferlist extra;
	  ::encode(created, extra);
	  reply->set_extra_bl(extra);
	}
	req->get_connection()->send_message(reply);

	if (req->is_queued_for_replay())
	  mds->queue_one_replay();

	req->put();
	return;
      }
      // completed non-open/create that created an inode: rewrite it into
      // a lookup/getattr so the client gets a full trace rather than a
      // traceless reply
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
	  req->get_op() != CEPH_MDS_OP_CREATE) {
	dout(10) << " completed request which created new inode " << created
		 << ", convert it to lookup request" << dendl;
	req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
	req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Sessions 'completed_requests' was dirtied, mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
	  session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
	session->reset_num_trim_requests_warnings();
    } else {
      // warn (with exponential backoff via the warnings counter) about
      // clients that never advance oldest_client_tid
      if (session->get_num_completed_requests() >=
	  (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
	session->inc_num_trim_requests_warnings();
	stringstream ss;
	ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
	   << req->get_oldest_client_tid() << "), "
	   << session->get_num_completed_requests()
	   << " completed requests recorded in session\n";
	mds->clog->warn() << ss.str();
	dout(20) << __func__ << " " << ss.str() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
	 p != req->releases.end();
	 ++p)
      mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
    req->releases.clear();
  }

  dispatch_client_request(mdr);
  return;
}
1611
1612 void Server::handle_osd_map()
1613 {
1614 /* Note that we check the OSDMAP_FULL flag directly rather than
1615 * using osdmap_full_flag(), because we want to know "is the flag set"
1616 * rather than "does the flag apply to us?" */
1617 mds->objecter->with_osdmap([this](const OSDMap& o) {
1618 is_full = o.test_flag(CEPH_OSDMAP_FULL);
1619 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1620 << o.get_epoch() << dendl;
1621 });
1622 }
1623
1624 void Server::dispatch_client_request(MDRequestRef& mdr)
1625 {
1626 // we shouldn't be waiting on anyone.
1627 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1628
1629 if (mdr->killed) {
1630 dout(10) << "request " << *mdr << " was killed" << dendl;
1631 return;
1632 }
1633
1634 MClientRequest *req = mdr->client_request;
1635
1636 if (logger) logger->inc(l_mdss_dispatch_client_request);
1637
1638 dout(7) << "dispatch_client_request " << *req << dendl;
1639
1640 if (req->may_write()) {
1641 if (mdcache->is_readonly()) {
1642 dout(10) << " read-only FS" << dendl;
1643 respond_to_request(mdr, -EROFS);
1644 return;
1645 }
1646 if (mdr->has_more() && mdr->more()->slave_error) {
1647 dout(10) << " got error from slaves" << dendl;
1648 respond_to_request(mdr, mdr->more()->slave_error);
1649 return;
1650 }
1651 }
1652
1653 if (is_full) {
1654 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1655 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1656 req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1657 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1658 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1659 req->get_op() == CEPH_MDS_OP_CREATE ||
1660 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1661 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1662 ((req->get_op() == CEPH_MDS_OP_LINK ||
1663 req->get_op() == CEPH_MDS_OP_RENAME) &&
1664 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1665 ) {
1666
1667 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1668 respond_to_request(mdr, -ENOSPC);
1669 return;
1670 } else {
1671 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1672 }
1673 }
1674
1675 switch (req->get_op()) {
1676 case CEPH_MDS_OP_LOOKUPHASH:
1677 case CEPH_MDS_OP_LOOKUPINO:
1678 handle_client_lookup_ino(mdr, false, false);
1679 break;
1680 case CEPH_MDS_OP_LOOKUPPARENT:
1681 handle_client_lookup_ino(mdr, true, false);
1682 break;
1683 case CEPH_MDS_OP_LOOKUPNAME:
1684 handle_client_lookup_ino(mdr, false, true);
1685 break;
1686
1687 // inodes ops.
1688 case CEPH_MDS_OP_LOOKUP:
1689 handle_client_getattr(mdr, true);
1690 break;
1691
1692 case CEPH_MDS_OP_LOOKUPSNAP:
1693 // lookupsnap does not reference a CDentry; treat it as a getattr
1694 case CEPH_MDS_OP_GETATTR:
1695 handle_client_getattr(mdr, false);
1696 break;
1697
1698 case CEPH_MDS_OP_SETATTR:
1699 handle_client_setattr(mdr);
1700 break;
1701 case CEPH_MDS_OP_SETLAYOUT:
1702 handle_client_setlayout(mdr);
1703 break;
1704 case CEPH_MDS_OP_SETDIRLAYOUT:
1705 handle_client_setdirlayout(mdr);
1706 break;
1707 case CEPH_MDS_OP_SETXATTR:
1708 handle_client_setxattr(mdr);
1709 break;
1710 case CEPH_MDS_OP_RMXATTR:
1711 handle_client_removexattr(mdr);
1712 break;
1713
1714 case CEPH_MDS_OP_READDIR:
1715 handle_client_readdir(mdr);
1716 break;
1717
1718 case CEPH_MDS_OP_SETFILELOCK:
1719 handle_client_file_setlock(mdr);
1720 break;
1721
1722 case CEPH_MDS_OP_GETFILELOCK:
1723 handle_client_file_readlock(mdr);
1724 break;
1725
1726 // funky.
1727 case CEPH_MDS_OP_CREATE:
1728 if (mdr->has_completed)
1729 handle_client_open(mdr); // already created.. just open
1730 else
1731 handle_client_openc(mdr);
1732 break;
1733
1734 case CEPH_MDS_OP_OPEN:
1735 handle_client_open(mdr);
1736 break;
1737
1738 // namespace.
1739 // no prior locks.
1740 case CEPH_MDS_OP_MKNOD:
1741 handle_client_mknod(mdr);
1742 break;
1743 case CEPH_MDS_OP_LINK:
1744 handle_client_link(mdr);
1745 break;
1746 case CEPH_MDS_OP_UNLINK:
1747 case CEPH_MDS_OP_RMDIR:
1748 handle_client_unlink(mdr);
1749 break;
1750 case CEPH_MDS_OP_RENAME:
1751 handle_client_rename(mdr);
1752 break;
1753 case CEPH_MDS_OP_MKDIR:
1754 handle_client_mkdir(mdr);
1755 break;
1756 case CEPH_MDS_OP_SYMLINK:
1757 handle_client_symlink(mdr);
1758 break;
1759
1760
1761 // snaps
1762 case CEPH_MDS_OP_LSSNAP:
1763 handle_client_lssnap(mdr);
1764 break;
1765 case CEPH_MDS_OP_MKSNAP:
1766 handle_client_mksnap(mdr);
1767 break;
1768 case CEPH_MDS_OP_RMSNAP:
1769 handle_client_rmsnap(mdr);
1770 break;
1771 case CEPH_MDS_OP_RENAMESNAP:
1772 handle_client_renamesnap(mdr);
1773 break;
1774
1775 default:
1776 dout(1) << " unknown client op " << req->get_op() << dendl;
1777 respond_to_request(mdr, -EOPNOTSUPP);
1778 }
1779 }
1780
1781
1782 // ---------------------------------------
1783 // SLAVE REQUESTS
1784
1785 /* This function DOES put the passed message before returning*/
1786 void Server::handle_slave_request(MMDSSlaveRequest *m)
1787 {
1788 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1789 mds_rank_t from = mds_rank_t(m->get_source().num());
1790
1791 if (logger) logger->inc(l_mdss_handle_slave_request);
1792
1793 // reply?
1794 if (m->is_reply())
1795 return handle_slave_request_reply(m);
1796
1797 // the purpose of rename notify is enforcing causal message ordering. making sure
1798 // bystanders have received all messages from rename srcdn's auth MDS.
1799 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1800 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1801 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1802 mds->send_message(reply, m->get_connection());
1803 m->put();
1804 return;
1805 }
1806
1807 CDentry *straydn = NULL;
1808 if (m->stray.length() > 0) {
1809 straydn = mdcache->add_replica_stray(m->stray, from);
1810 assert(straydn);
1811 m->stray.clear();
1812 }
1813
1814 // am i a new slave?
1815 MDRequestRef mdr;
1816 if (mdcache->have_request(m->get_reqid())) {
1817 // existing?
1818 mdr = mdcache->request_get(m->get_reqid());
1819
1820 // is my request newer?
1821 if (mdr->attempt > m->get_attempt()) {
1822 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
1823 << ", dropping " << *m << dendl;
1824 m->put();
1825 return;
1826 }
1827
1828
1829 if (mdr->attempt < m->get_attempt()) {
1830 // mine is old, close it out
1831 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
1832 << ", closing out" << dendl;
1833 mdcache->request_finish(mdr);
1834 mdr.reset();
1835 } else if (mdr->slave_to_mds != from) {
1836 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
1837 m->put();
1838 return;
1839 }
1840
1841 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
1842 mdr->aborted = true;
1843 if (mdr->slave_request) {
1844 // only abort on-going xlock, wrlock and auth pin
1845 assert(!mdr->slave_did_prepare());
1846 } else {
1847 mdcache->request_finish(mdr);
1848 }
1849 return;
1850 }
1851 }
1852 if (!mdr.get()) {
1853 // new?
1854 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
1855 dout(10) << "missing slave request for " << m->get_reqid()
1856 << " OP_FINISH, must have lost race with a forward" << dendl;
1857 m->put();
1858 return;
1859 }
1860 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
1861 mdr->set_op_stamp(m->op_stamp);
1862 }
1863 assert(mdr->slave_request == 0); // only one at a time, please!
1864
1865 if (straydn) {
1866 mdr->pin(straydn);
1867 mdr->straydn = straydn;
1868 }
1869
1870 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1871 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1872 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1873 return;
1874 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
1875 mdr->locks.empty()) {
1876 dout(3) << "not active yet, waiting" << dendl;
1877 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
1878 return;
1879 }
1880
1881 mdr->slave_request = m;
1882
1883 dispatch_slave_request(mdr);
1884 }
1885
/* This function DOES put the passed message before returning*/
/*
 * Process a reply from a slave MDS for an in-flight master request.
 *
 * @param m  the slave reply message; always consumed (put) by this
 *           function or by the retry context it queues.
 *
 * Dispatches lock acks directly, and forwards the prep/notify acks to the
 * per-operation handlers.  Aborts on an unknown opcode.
 */
void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // Not in a state to process slave replies yet.  If we have no record of
  // an uncommitted master update for this reqid, the reply is stale and is
  // dropped; otherwise requeue it until we reach clientreplay/active.
  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_master(r, from)) {
      dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
	       << from << " reqid " << r << dendl;
      m->put();
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // OP_COMMITTED needs no MDRequest: just record that this slave has
  // committed its part of the master update.
  if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_master_slave(r, from);
    m->put();
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  if (m->get_attempt() != mdr->attempt) {
    // reply belongs to an earlier incarnation of this request; ignore it
    dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case MMDSSlaveRequest::OP_XLOCKACK:
    {
      // identify lock, master request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->slaves.insert(from);
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      // record the remotely granted xlock on the master request
      mdr->xlocks.insert(lock);
      mdr->locks.insert(lock);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      // this must have been the only outstanding slave; resume dispatch
      assert(mdr->more()->waiting_on_slave.count(from));
      mdr->more()->waiting_on_slave.erase(from);
      assert(mdr->more()->waiting_on_slave.empty());
      mdcache->dispatch_request(mdr);
    }
    break;
    
  case MMDSSlaveRequest::OP_WRLOCKACK:
    {
      // identify lock, master request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->slaves.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      // remote wrlocks are tracked by the rank that holds them
      mdr->remote_wrlocks[lock] = from;
      mdr->locks.insert(lock);
      mdr->finish_locking(lock);

      assert(mdr->more()->waiting_on_slave.count(from));
      mdr->more()->waiting_on_slave.erase(from);
      assert(mdr->more()->waiting_on_slave.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSSlaveRequest::OP_AUTHPINACK:
    handle_slave_auth_pin_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_LINKPREPACK:
    handle_slave_link_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREPACK:
    handle_slave_rmdir_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREPACK:
    handle_slave_rename_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
    handle_slave_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort();
  }

  // done with reply.
  m->put();
}
1984
/* This function DOES put the mdr->slave_request before returning*/
/*
 * Dispatch a slave-side request (we are the slave, mdr->slave_request is
 * the message from the master).  Lock/unlock ops are handled inline; the
 * prep ops are forwarded to per-operation handlers which take over
 * ownership of mdr->slave_request.
 */
void Server::dispatch_slave_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_slave_request);

  int op = mdr->slave_request->get_op();
  switch (op) {
  case MMDSSlaveRequest::OP_XLOCK:
  case MMDSSlaveRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen, if we auth pinned properly.
      }
      if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	// we are no longer auth; drop silently (master will sort it out)
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	set<SimpleLock*> rdlocks;
	set<SimpleLock*> wrlocks = mdr->wrlocks;
	set<SimpleLock*> xlocks = mdr->xlocks;

	int replycode = 0;
	switch (op) {
	case MMDSSlaveRequest::OP_XLOCK:
	  xlocks.insert(lock);
	  replycode = MMDSSlaveRequest::OP_XLOCKACK;
	  break;
	case MMDSSlaveRequest::OP_WRLOCK:
	  wrlocks.insert(lock);
	  replycode = MMDSSlaveRequest::OP_WRLOCKACK;
	  break;
	}

	// may return false and requeue the request; slave_request stays set
	if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
	  return;
	
	// ack
	MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	mds->send_message(r, mdr->slave_request->get_connection());
      }

      // done.
      mdr->slave_request->put();
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_UNXLOCK:
  case MMDSSlaveRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());
      assert(lock);
      bool need_issue = false;
      switch (op) {
      case MMDSSlaveRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
	break;
      case MMDSSlaveRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
	break;
      }
      if (need_issue)
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done.  no ack necessary.
      mdr->slave_request->put();
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_DROPLOCKS:
    mds->locker->drop_locks(mdr.get());
    mdr->slave_request->put();
    mdr->slave_request = 0;
    break;

  case MMDSSlaveRequest::OP_AUTHPIN:
    handle_slave_auth_pin(mdr);
    break;

  case MMDSSlaveRequest::OP_LINKPREP:
  case MMDSSlaveRequest::OP_UNLINKPREP:
    handle_slave_link_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREP:
    handle_slave_rmdir_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREP:
    handle_slave_rename_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_FINISH:
    // information about rename imported caps
    if (mdr->slave_request->inode_export.length() > 0)
      mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
    // finish off request.
    mdcache->request_finish(mdr);
    break;

  default: 
    ceph_abort();
  }
}
2107
/* This function DOES put the mdr->slave_request before returning*/
/*
 * Slave-side handler for an OP_AUTHPIN request from the master: resolve
 * the requested objects in our cache, auth-pin them (or wait / fail), and
 * reply with OP_AUTHPINACK listing what we actually pinned.
 *
 * Failure modes reported in the ack: read-only FS (rofs) and
 * would-block for nonblocking requests.  Missing or non-auth objects
 * fail silently with an empty pin list.
 */
void Server::handle_slave_auth_pin(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_auth_pin " << *mdr << dendl;

  // build list of objects
  list<MDSCacheObject*> objects;
  CInode *auth_pin_freeze = NULL;
  bool fail = false, wouldblock = false, readonly = false;

  if (mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;
    readonly = true;
    fail = true;
  }

  if (!fail) {
    // resolve each MDSCacheObjectInfo into a local cache object
    for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
	 p != mdr->slave_request->get_authpins().end();
	 ++p) {
      MDSCacheObject *object = mdcache->get_object(*p);
      if (!object) {
	dout(10) << " don't have " << *p << dendl;
	fail = true;
	break;
      }

      objects.push_back(object);
      // remember which inode (if any) the master wants frozen
      if (*p == mdr->slave_request->get_authpin_freeze())
	auth_pin_freeze = static_cast<CInode*>(object);
    }
  }
  
  // can we auth pin them?
  if (!fail) {
    for (list<MDSCacheObject*>::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      if (!(*p)->is_auth()) {
	dout(10) << " not auth for " << **p << dendl;
	fail = true;
	break;
      }
      if (mdr->is_auth_pinned(*p))
	continue;
      if (!mdr->can_auth_pin(*p)) {
	if (mdr->slave_request->is_nonblock()) {
	  dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
	  fail = true;
	  wouldblock = true;
	  break;
	}
	// wait
	dout(10) << " waiting for authpinnable on " << **p << dendl;
	(*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
	mdr->drop_local_auth_pins();

	// tell the freezer someone is waiting on it, so the freeze can
	// be cancelled if it takes too long (avoids starving us)
	CDir *dir = NULL;
	if (CInode *in = dynamic_cast<CInode*>(*p)) {
	  if (!in->is_root())
	    dir = in->get_parent_dir();
	} else if (CDentry *dn = dynamic_cast<CDentry*>(*p)) {
	  dir = dn->get_dir();
	} else {
	  ceph_abort();
	}
	if (dir) {
	  if (dir->is_freezing_dir())
	    mdcache->fragment_freeze_inc_num_waiters(dir);
	  if (dir->is_freezing_tree()) {
	    while (!dir->is_freezing_tree_root())
	      dir = dir->get_parent_dir();
	    mdcache->migrator->export_freeze_inc_num_waiters(dir);
	  }
	}
	return;
      }
    }
  }

  // auth pin!
  if (fail) {
    mdr->drop_local_auth_pins();  // just in case
  } else {
    /* freeze authpin wrong inode */
    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
	mdr->more()->rename_inode != auth_pin_freeze)
      mdr->unfreeze_auth_pin(true);

    /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
     * on the source inode to complete. This happens after all locks for the rename
     * operation are acquired. But to acquire locks, we need auth pin locks' parent
     * objects first. So there is an ABBA deadlock if someone auth pins the source inode
     * after locks are acquired and before Server::handle_slave_rename_prep() is called.
     * The solution is freeze the inode and prevent other MDRequests from getting new
     * auth pins.
     */
    if (auth_pin_freeze) {
      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
	auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
	mds->mdlog->flush();
	return;
      }
    }
    for (list<MDSCacheObject*>::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      dout(10) << "auth_pinning " << **p << dendl;
      mdr->auth_pin(*p);
    }
  }

  // ack!
  MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
  
  // return list of my auth_pins (if any)
  for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
       p != mdr->auth_pins.end();
       ++p) {
    MDSCacheObjectInfo info;
    (*p)->set_object_info(info);
    reply->get_authpins().push_back(info);
    if (*p == (MDSCacheObject*)auth_pin_freeze)
      auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
  }

  if (wouldblock)
    reply->mark_error_wouldblock();
  if (readonly)
    reply->mark_error_rofs();

  mds->send_message_mds(reply, mdr->slave_to_mds);
  
  // clean up this request
  mdr->slave_request->put();
  mdr->slave_request = 0;
  return;
}
2247
/* This function DOES NOT put the passed ack before returning*/
/*
 * Master-side handler for OP_AUTHPINACK: reconcile our record of which
 * objects the slave has auth-pinned with the list in the ack, note any
 * error flags, and resume request dispatch once no slaves remain pending.
 */
void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // added auth pins?
  set<MDSCacheObject*> pinned;
  for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
       p != ack->get_authpins().end();
       ++p) {
    MDSCacheObject *object = mdcache->get_object(*p);
    assert(object);  // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    if (!mdr->is_auth_pinned(object))
      mdr->remote_auth_pins[object] = from;
    if (*p == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
    pinned.insert(object);
  }

  // removed frozen auth pin ?
  // (a default-constructed MDSCacheObjectInfo in the ack means the slave
  //  is no longer holding the frozen auth pin)
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
    assert(p != mdr->remote_auth_pins.end());
    if (p->second == from) {
      mdr->more()->is_remote_frozen_authpin = false;
    }
  }

  // removed auth pins?
  // anything we thought this slave had pinned but is absent from the ack
  // has been dropped on the slave side
  map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
  while (p != mdr->remote_auth_pins.end()) {
    MDSCacheObject* object = p->first;
    if (p->second == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->remote_auth_pins.erase(p++);  // post-increment keeps iterator valid
    } else {
      ++p;
    }
  }
  
  // slave-reported failures abort the master request
  if (ack->is_error_rofs()) {
    mdr->more()->slave_error = -EROFS;
    mdr->aborted = true;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->slave_error = -EWOULDBLOCK;
    mdr->aborted = true;
  }
  
  // note slave
  mdr->more()->slaves.insert(from);

  // clear from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // go again?
  if (mdr->more()->waiting_on_slave.empty())
    mdcache->dispatch_request(mdr);
  else 
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
2312
2313
2314 // ---------------------------------------
2315 // HELPERS
2316
2317
2318 /**
2319 * check whether we are permitted to complete a request
2320 *
2321 * Check whether we have permission to perform the operation specified
2322 * by mask on the given inode, based on the capability in the mdr's
2323 * session.
2324 */
2325 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2326 {
2327 if (mdr->session) {
2328 int r = mdr->session->check_access(
2329 in, mask,
2330 mdr->client_request->get_caller_uid(),
2331 mdr->client_request->get_caller_gid(),
2332 &mdr->client_request->get_caller_gid_list(),
2333 mdr->client_request->head.args.setattr.uid,
2334 mdr->client_request->head.args.setattr.gid);
2335 if (r < 0) {
2336 respond_to_request(mdr, r);
2337 return false;
2338 }
2339 }
2340 return true;
2341 }
2342
2343 /**
2344 * check whether fragment has reached maximum size
2345 *
2346 */
2347 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2348 {
2349 const auto size = in->get_frag_size();
2350 if (size >= g_conf->mds_bal_fragment_size_max) {
2351 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2352 respond_to_request(mdr, -ENOSPC);
2353 return false;
2354 }
2355
2356 return true;
2357 }
2358
2359
2360 /** validate_dentry_dir
2361 *
2362 * verify that the dir exists and would own the dname.
2363 * do not check if the dentry exists.
2364 */
2365 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
2366 {
2367 // make sure parent is a dir?
2368 if (!diri->is_dir()) {
2369 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2370 respond_to_request(mdr, -ENOTDIR);
2371 return NULL;
2372 }
2373
2374 // which dirfrag?
2375 frag_t fg = diri->pick_dirfrag(dname);
2376 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2377 if (!dir)
2378 return 0;
2379
2380 // frozen?
2381 if (dir->is_frozen()) {
2382 dout(7) << "dir is frozen " << *dir << dendl;
2383 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2384 return NULL;
2385 }
2386
2387 return dir;
2388 }
2389
2390
/** prepare_null_dentry
 * prepare a null (or existing) dentry in given dir. 
 * wait for any dn lock.
 *
 * @param mdr      the request
 * @param dir      auth dirfrag that will own the dentry
 * @param dname    name of the dentry
 * @param okexist  if false, an existing non-null dentry is an -EEXIST error
 * @returns the dentry, or NULL if the request was responded to / requeued
 */
CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
{
  dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
  assert(dir->is_auth());

  client_t client = mdr->get_client();

  // does it already exist?
  CDentry *dn = dir->lookup(dname);
  if (dn) {
    /*
    if (dn->lock.is_xlocked_by_other(mdr)) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    */
    if (!dn->get_linkage(client, mdr)->is_null()) {
      // name already exists
      dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
      if (!okexist) {
        respond_to_request(mdr, -EEXIST);
        return 0;
      }
    } else {
      // existing null dentry: bump its first snapid past the current
      // snaprealm seq so the new link starts a fresh interval
      dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
    }

    return dn;
  }

  // make sure dir is complete
  // (only fetch if the bloom filter says the name might be present)
  if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
    dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }
  
  // create
  dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
  dn->mark_new();
  dout(10) << "prepare_null_dentry added " << *dn << dendl;
  return dn;
}
2439
2440 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2441 {
2442 CDentry *straydn = mdr->straydn;
2443 if (straydn) {
2444 string straydname;
2445 in->name_stray_dentry(straydname);
2446 if (straydn->get_name() == straydname)
2447 return straydn;
2448
2449 assert(!mdr->done_locking);
2450 mdr->unpin(straydn);
2451 }
2452
2453 CDir *straydir = mdcache->get_stray_dir(in);
2454
2455 if (!mdr->client_request->is_replay() &&
2456 !check_fragment_space(mdr, straydir))
2457 return NULL;
2458
2459 straydn = mdcache->get_or_create_stray_dentry(in);
2460 mdr->straydn = straydn;
2461 mdr->pin(straydn);
2462 return straydn;
2463 }
2464
/** prepare_new_inode
 *
 * create a new inode.  set c/m/atime.  hit dir pop.
 *
 * @param mdr     the request (supplies session, op stamp, caller uid/gid)
 * @param dir     the directory the inode is being created in
 * @param useino  ino requested by the client (e.g. on replay); 0 for any
 * @param mode    file mode bits (also selects dir vs file layout handling)
 * @param layout  optional file layout override; default layout otherwise
 * @returns the new CInode, already added to mdcache
 */
CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
				  file_layout_t *layout) 
{
  CInode *in = new CInode(mdcache);
  
  // Server::prepare_force_open_sessions() can re-open session in closing
  // state. In that corner case, session's prealloc_inos are being freed.
  // To simplify the code, we disallow using/refilling session's prealloc_ino
  // while session is opening.
  bool allow_prealloc_inos = !mdr->session->is_opening();

  // assign ino
  if (allow_prealloc_inos &&
      mdr->session->info.prealloc_inos.size()) {
    // take an ino from the session's preallocated pool
    mdr->used_prealloc_ino = 
      in->inode.ino = mdr->session->take_ino(useino);  // prealloc -> used
    mds->sessionmap.mark_projected(mdr->session);

    dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
	     << " (" << mdr->session->info.prealloc_inos
	     << ", " << mdr->session->info.prealloc_inos.size() << " left)"
	     << dendl;
  } else {
    // allocate a fresh ino from the inotable (projected; applied at commit)
    mdr->alloc_ino = 
      in->inode.ino = mds->inotable->project_alloc_id();
    dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
  }

  if (useino && useino != in->inode.ino) {
    // client asked for a specific ino we couldn't honor; log loudly but
    // proceed with the ino we allocated
    dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
    mds->clog->error() << mdr->client_request->get_source()
       << " specified ino " << useino
       << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
    //ceph_abort(); // just for now.
  }
    
  // top up the session's preallocated ino pool when it runs low
  if (allow_prealloc_inos &&
      mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
    int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
    assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
  }

  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.nlink = 1;   // FIXME

  in->inode.mode = mode;

  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
  } else if (layout) {
    in->inode.layout = *layout;
  } else {
    in->inode.layout = mdcache->default_file_layout;
  }

  in->inode.truncate_size = -1ull;  // not truncated, yet!
  in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */

  CInode *diri = dir->get_inode();

  dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;

  // NOTE: despite the "sticky" log wording below, this is the setgid bit:
  // children of a setgid dir inherit its gid, and subdirs inherit setgid too
  if (diri->inode.mode & S_ISGID) {
    dout(10) << " dir is sticky" << dendl;
    in->inode.gid = diri->inode.gid;
    if (S_ISDIR(mode)) {
      dout(10) << " new dir also sticky" << dendl;      
      in->inode.mode |= S_ISGID;
    }
  } else 
    in->inode.gid = mdr->client_request->get_caller_gid();

  in->inode.uid = mdr->client_request->get_caller_uid();

  in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
    mdr->get_op_stamp();

  in->inode.change_attr = 0;

  MClientRequest *req = mdr->client_request;
  if (req->get_data().length()) {
    bufferlist::iterator p = req->get_data().begin();

    // xattrs on new inode?
    map<string,bufferptr> xattrs;
    ::decode(xattrs, p);
    for (map<string,bufferptr>::iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
      dout(10) << "prepare_new_inode setting xattr " << p->first << dendl;
      in->xattrs[p->first] = p->second;
    }
  }

  // disable inline data unless both the cluster and the client support it
  if (!mds->mdsmap->get_inline_data_enabled() ||
      !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
    in->inode.inline_data.version = CEPH_INLINE_NONE;

  mdcache->add_inode(in);  // add
  dout(10) << "prepare_new_inode " << *in << dendl;
  return in;
}
2575
2576 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
2577 {
2578 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
2579 << " inotablev " << mds->inotable->get_projected_version()
2580 << dendl;
2581 blob->set_ino_alloc(mdr->alloc_ino,
2582 mdr->used_prealloc_ino,
2583 mdr->prealloc_inos,
2584 mdr->client_request->get_source(),
2585 mds->sessionmap.get_projected(),
2586 mds->inotable->get_projected_version());
2587 }
2588
2589 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2590 {
2591 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2592 << " / " << mdr->prealloc_inos
2593 << " / " << mdr->used_prealloc_ino << dendl;
2594
2595 if (mdr->alloc_ino) {
2596 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2597 }
2598 if (mdr->prealloc_inos.size()) {
2599 assert(session);
2600 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2601 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2602 mds->sessionmap.mark_dirty(session);
2603 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2604 }
2605 if (mdr->used_prealloc_ino) {
2606 assert(session);
2607 session->info.used_inos.erase(mdr->used_prealloc_ino);
2608 mds->sessionmap.mark_dirty(session);
2609 }
2610 }
2611
2612 class C_MDS_TryFindInode : public ServerContext {
2613 MDRequestRef mdr;
2614 public:
2615 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2616 void finish(int r) override {
2617 if (r == -ESTALE) // :( find_ino_peers failed
2618 server->respond_to_request(mdr, r);
2619 else
2620 server->dispatch_client_request(mdr);
2621 }
2622 };
2623
2624 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2625 {
2626 // figure parent dir vs dname
2627 if (refpath.depth() == 0) {
2628 dout(7) << "can't do that to root" << dendl;
2629 respond_to_request(mdr, -EINVAL);
2630 return 0;
2631 }
2632 string dname = refpath.last_dentry();
2633 refpath.pop_dentry();
2634
2635 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2636
2637 // traverse to parent dir
2638 CInode *diri;
2639 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2640 if (r > 0) return 0; // delayed
2641 if (r < 0) {
2642 if (r == -ESTALE) {
2643 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2644 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2645 return 0;
2646 }
2647 respond_to_request(mdr, r);
2648 return 0;
2649 }
2650
2651 // is it an auth dir?
2652 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2653 if (!dir)
2654 return 0; // forwarded or waiting for freeze
2655
2656 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2657 return dir;
2658 }
2659
/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to */
/*
 * Traverse to, pin, and collect rdlocks for the inode named by filepath n
 * of the request.
 *
 * @param mdr          the request
 * @param n            0 = primary filepath, nonzero = filepath2
 * @param rdlocks      out: rdlocks to take on traversed dentries + snap locks
 * @param want_auth    require that we are auth for the inode (forward if not)
 * @param no_want_auth for readdir, who doesn't want auth _even_if_ it's
 *                     a snapped dir
 * @param layout       if non-NULL, also collect layout-related snap rdlocks
 * @param no_lookup    true if we cannot return a null dentry lease
 * @returns the referenced inode, or NULL if handled/delayed
 */
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
				    set<SimpleLock*> &rdlocks,
				    bool want_auth,
				    bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
							  a snapped dir */
				    file_layout_t **layout,
				    bool no_lookup)  // true if we cannot return a null dentry lease
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  // locks already settled on an earlier dispatch; reuse the cached inode
  if (mdr->done_locking)
    return mdr->in[n];

  // traverse
  int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
  if (r > 0)
    return NULL; // delayed
  if (r < 0) {  // error
    if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
      // return the last traversed dentry so the client can get a null
      // dentry lease (unless the caller disallowed it)
      if (!no_lookup)
	mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
      respond_to_request(mdr, r);
    } else if (r == -ESTALE) {
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
      mdcache->find_ino_peers(refpath.get_ino(), c);
    } else {
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);
    }
    return 0;
  }
  CInode *ref = mdr->in[n];
  dout(10) << "ref is " << *ref << dendl;

  // fw to inode auth?
  if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
    want_auth = true;

  if (want_auth) {
    if (ref->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    if (!ref->is_auth()) {
      dout(10) << "fw to auth for " << *ref << dendl;
      mdcache->request_forward(mdr, ref->authority().first);
      return 0;
    }

    // auth_pin?
    //   do NOT proceed if freezing, as cap release may defer in that case, and
    //   we could deadlock when we try to lock @ref.
    // if we're already auth_pinned, continue; the release has already been processed.
    if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
	(ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      /* If we have any auth pins, this will deadlock.
       * But the only way to get here if we've already got auth pins
       * is because we're on an inode with snapshots that got updated
       * between dispatches of this request. So we're going to drop
       * our locks and our auth pins and reacquire them later.
       *
       * This is safe since we're only in this function when working on
       * a single MDS request; otherwise we'd be in
       * rdlock_path_xlock_dentry.
       */
      mds->locker->drop_locks(mdr.get(), NULL);
      mdr->drop_local_auth_pins();
      return 0;
    }

    mdr->auth_pin(ref);
  }

  // rdlock every dentry on the traversal path, plus the snap locks
  for (int i=0; i<(int)mdr->dn[n].size(); i++) 
    rdlocks.insert(&mdr->dn[n][i]->lock);
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
  else
    mds->locker->include_snap_rdlocks(rdlocks, ref);

  // set and pin ref
  mdr->pin(ref);
  return ref;
}
2751
2752
/** rdlock_path_xlock_dentry
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry, forward as necessary.
 * create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 *
 * @param mdr         the request
 * @param n           0 = primary filepath, nonzero = filepath2
 * @param rdlocks     out: rdlocks on traversed dentries + snap locks
 * @param wrlocks     out: wrlocks on the containing dir's filelock/nestlock
 * @param xlocks      out: xlock on the target dentry when new/alwaysxlock
 * @param okexist     allow an existing (non-null) dentry
 * @param mustexist   require an existing, non-null dentry (-ENOENT otherwise)
 * @param alwaysxlock xlock the dentry even if it already exists
 * @param layout      if non-NULL, also collect layout-related snap rdlocks
 * @returns the dentry, or NULL if the request was handled/delayed
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
					  set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
					  bool okexist, bool mustexist, bool alwaysxlock,
					  file_layout_t **layout)
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();

  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  client_t client = mdr->get_client();

  // locks already settled on an earlier dispatch; reuse the cached dentry
  if (mdr->done_locking)
    return mdr->dn[n].back();

  CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
  if (!dir) return 0;
  dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl;

  // make sure we can auth_pin (or have already authpinned) dir
  if (dir->is_frozen()) {
    dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl;
    dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  CInode *diri = dir->get_inode();
  if (!mdr->reqid.name.is_mds()) {
    // clients may not modify system dirs (other than root itself)
    if (diri->is_system() && !diri->is_root()) {
      respond_to_request(mdr, -EROFS);
      return 0;
    }
  }
  // the parent is already unlinked (in a stray dir); nothing to do here
  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -ENOENT);
    return 0;
  }

  // make a null dentry?
  const string &dname = refpath.last_dentry();
  CDentry *dn;
  if (mustexist) {
    dn = dir->lookup(dname);

    // make sure dir is complete
    // (only fetch if the bloom filter says the name might be present)
    if (!dn && !dir->is_complete() &&
        (!dir->has_bloom() || dir->is_in_bloom(dname))) {
      dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // readable?
    if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
      
    // exists?
    if (!dn || dn->get_linkage(client, mdr)->is_null()) {
      dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
      respond_to_request(mdr, -ENOENT);
      return 0;
    }    
  } else {
    dn = prepare_null_dentry(mdr, dir, dname, okexist);
    if (!dn) 
      return 0;
  }

  mdr->dn[n].push_back(dn);
  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  mdr->in[n] = dnl->get_inode();

  // -- lock --
  // NOTE: rename takes the same set of locks for srcdn
  for (int i=0; i<(int)mdr->dn[n].size(); i++) 
    rdlocks.insert(&mdr->dn[n][i]->lock);
  if (alwaysxlock || dnl->is_null())
    xlocks.insert(&dn->lock);                 // new dn, xlock
  else
    rdlocks.insert(&dn->lock);  // existing dn, rdlock
  wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
  wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir mtime
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
  else
    mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);

  return dn;
}
2850
2851
2852
2853
2854
2855 /**
2856 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2857 *
2858 * @param diri base inode
2859 * @param fg the exact frag we want
2860 * @param mdr request
2861 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2862 */
2863 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2864 {
2865 CDir *dir = diri->get_dirfrag(fg);
2866
2867 // not open and inode not mine?
2868 if (!dir && !diri->is_auth()) {
2869 mds_rank_t inauth = diri->authority().first;
2870 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2871 mdcache->request_forward(mdr, inauth);
2872 return 0;
2873 }
2874
2875 // not open and inode frozen?
2876 if (!dir && diri->is_frozen()) {
2877 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2878 assert(diri->get_parent_dir());
2879 diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2880 return 0;
2881 }
2882
2883 // invent?
2884 if (!dir)
2885 dir = diri->get_or_open_dirfrag(mdcache, fg);
2886
2887 // am i auth for the dirfrag?
2888 if (!dir->is_auth()) {
2889 mds_rank_t auth = dir->authority().first;
2890 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2891 << ", fw to mds." << auth << dendl;
2892 mdcache->request_forward(mdr, auth);
2893 return 0;
2894 }
2895
2896 return dir;
2897 }
2898
2899
2900 // ===============================================================================
2901 // STAT
2902
/**
 * Handle a client getattr (stat) or lookup request.
 *
 * @param mdr the client request
 * @param is_lookup true for a lookup (requires a non-empty refpath and
 *        returns a trace dentry), false for a plain getattr
 *
 * Replies via respond_to_request(); returns early (without replying) when
 * the request has been queued for retry or forwarded by a helper.
 */
void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  if (req->get_filepath().depth() == 0 && is_lookup) {
    // refpath can't be empty for lookup but it can for
    // getattr (we do getattr with empty refpath for mount of '/')
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // resolve the path to the target inode; helper handles forward/retry
  CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
  if (!ref) return;

  /*
   * if client currently holds the EXCL cap on a field, do not rdlock
   * it; client's stat() will result in valid info if _either_ EXCL
   * cap is held or MDS rdlocks and reads the value here.
   *
   * handling this case here is easier than weakening rdlock
   * semantics... that would cause problems elsewhere.
   */
  client_t client = mdr->get_client();
  int issued = 0;
  Capability *cap = ref->get_client_cap(client);
  if (cap && (mdr->snapid == CEPH_NOSNAP ||
	      mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  // only rdlock the fields the client asked for and doesn't hold EXCL on
  int mask = req->head.args.getattr.mask;
  if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
  if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
  if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, ref, MAY_READ))
    return;

  // note which caps are requested, so we return at least a snapshot
  // value for them.  (currently this matters for xattrs and inline data)
  mdr->getattr_caps = mask;

  // feed read popularity into the balancer
  mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
			   req->get_source().num());

  // reply
  dout(10) << "reply to stat on " << *req << dendl;
  mdr->tracei = ref;
  if (is_lookup)
    mdr->tracedn = mdr->dn[0].back();
  respond_to_request(mdr, 0);
}
2959
// Completion context for MDCache::open_ino() issued by
// handle_client_lookup_ino(); re-enters the server via _lookup_ino_2()
// with the open_ino result (a rank if >= 0, else an error code).
struct C_MDS_LookupIno2 : public ServerContext {
  MDRequestRef mdr;
  C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_lookup_ino_2(mdr, r);
  }
};
2967
/* This function DOES clean up the mdr before returning*/
/*
 * filepath: ino
 *
 * Resolve an inode by its number.  Depending on the flags:
 *  - want_parent: reply with the (projected) parent directory inode
 *  - want_dentry: reply with the inode plus its linking dentry,
 *    optionally validated against the directory ino in filepath2
 *  - neither: reply with the inode itself
 */
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
				      bool want_parent, bool want_dentry)
{
  MClientRequest *req = mdr->client_request;

  inodeno_t ino = req->get_filepath().get_ino();
  CInode *in = mdcache->get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    // inode is on its way out; treat as stale
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!in) {
    // not in cache; go discover it, then re-enter via _lookup_ino_2
    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
    return;
  }

  // snaprealm parents must be open before we can reply; retry when ready
  if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
      !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
    return;
  }

  // check for nothing (not read or write); this still applies the
  // path check.
  if (!check_access(mdr, in, 0))
    return;

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ? dn->get_dir()->inode : NULL;

  set<SimpleLock*> rdlocks;
  if (dn && (want_parent || want_dentry)) {
    // we will expose the dentry/parent, so pin and rdlock it
    mdr->pin(dn);
    rdlocks.insert(&dn->lock);
  }

  unsigned mask = req->head.args.getattr.mask;
  if (mask) {
    // as in getattr: don't rdlock fields the client holds EXCL caps on
    Capability *cap = in->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      rdlocks.insert(&in->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      rdlocks.insert(&in->xattrlock);

    mdr->getattr_caps = mask;
  }

  if (!rdlocks.empty()) {
    set<SimpleLock*> wrlocks, xlocks;
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    // need read access to directory inode
    if (!check_access(mdr, diri, MAY_READ))
      return;
  }

  if (want_parent) {
    if (in->is_base()) {
      // a base inode (e.g. root) has no parent to return
      respond_to_request(mdr, -EINVAL);
      return;
    }
    if (!diri || diri->is_stray()) {
      // unlinked (stray) or parentless: nothing meaningful to return
      respond_to_request(mdr, -ESTALE);
      return;
    }
    dout(10) << "reply to lookup_parent " << *in << dendl;
    mdr->tracei = diri;
    respond_to_request(mdr, 0);
  } else {
    if (want_dentry) {
      // optionally verify the dentry lives in the directory the client named
      inodeno_t dirino = req->get_filepath2().get_ino();
      if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
	respond_to_request(mdr, -ENOENT);
	return;
      }
      dout(10) << "reply to lookup_name " << *in << dendl;
    } else
      dout(10) << "reply to lookup_ino " << *in << dendl;

    mdr->tracei = in;
    if (want_dentry)
      mdr->tracedn = dn;
    respond_to_request(mdr, 0);
  }
}
3061
3062 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3063 {
3064 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3065 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3066
3067 // `r` is a rank if >=0, else an error code
3068 if (r >= 0) {
3069 mds_rank_t dest_rank(r);
3070 if (dest_rank == mds->get_nodeid())
3071 dispatch_client_request(mdr);
3072 else
3073 mdcache->request_forward(mdr, dest_rank);
3074 return;
3075 }
3076
3077 // give up
3078 if (r == -ENOENT || r == -ENODATA)
3079 r = -ESTALE;
3080 respond_to_request(mdr, r);
3081 }
3082
3083
3084 /* This function takes responsibility for the passed mdr*/
3085 void Server::handle_client_open(MDRequestRef& mdr)
3086 {
3087 MClientRequest *req = mdr->client_request;
3088 dout(7) << "open on " << req->get_filepath() << dendl;
3089
3090 int flags = req->head.args.open.flags;
3091 int cmode = ceph_flags_to_mode(flags);
3092 if (cmode < 0) {
3093 respond_to_request(mdr, -EINVAL);
3094 return;
3095 }
3096
3097 bool need_auth = !file_mode_is_readonly(cmode) || (flags & CEPH_O_TRUNC);
3098
3099 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3100 dout(7) << "read-only FS" << dendl;
3101 respond_to_request(mdr, -EROFS);
3102 return;
3103 }
3104
3105 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3106 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3107 if (!cur)
3108 return;
3109
3110 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3111 assert(!need_auth);
3112 mdr->done_locking = false;
3113 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3114 if (!cur)
3115 return;
3116 }
3117
3118 if (!cur->inode.is_file()) {
3119 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3120 cmode = CEPH_FILE_MODE_PIN;
3121 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3122 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3123 flags &= ~CEPH_O_TRUNC;
3124 }
3125
3126 dout(10) << "open flags = " << flags
3127 << ", filemode = " << cmode
3128 << ", need_auth = " << need_auth
3129 << dendl;
3130
3131 // regular file?
3132 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3133 dout(7) << "not a file or dir " << *cur << dendl;
3134 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3135 return;
3136 }*/
3137 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3138 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3139 respond_to_request(mdr, -EINVAL);
3140 return;
3141 }
3142
3143 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3144 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3145 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3146 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3147 return;
3148 }
3149
3150 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3151 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3152 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3153 respond_to_request(mdr, -EPERM);
3154 return;
3155 }
3156
3157 // snapped data is read only
3158 if (mdr->snapid != CEPH_NOSNAP &&
3159 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3160 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3161 respond_to_request(mdr, -EROFS);
3162 return;
3163 }
3164
3165 unsigned mask = req->head.args.open.mask;
3166 if (mask) {
3167 Capability *cap = cur->get_client_cap(mdr->get_client());
3168 int issued = 0;
3169 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3170 issued = cap->issued();
3171 // permission bits, ACL/security xattrs
3172 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3173 rdlocks.insert(&cur->authlock);
3174 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3175 rdlocks.insert(&cur->xattrlock);
3176
3177 mdr->getattr_caps = mask;
3178 }
3179
3180 // O_TRUNC
3181 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3182 assert(cur->is_auth());
3183
3184 xlocks.insert(&cur->filelock);
3185 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3186 return;
3187
3188 if (!check_access(mdr, cur, MAY_WRITE))
3189 return;
3190
3191 // wait for pending truncate?
3192 const inode_t *pi = cur->get_projected_inode();
3193 if (pi->is_truncating()) {
3194 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3195 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3196 mds->locker->drop_locks(mdr.get());
3197 mdr->drop_local_auth_pins();
3198 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3199 return;
3200 }
3201
3202 do_open_truncate(mdr, cmode);
3203 return;
3204 }
3205
3206 // sync filelock if snapped.
3207 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3208 // and that data itself is flushed so that we can read the snapped data off disk.
3209 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3210 rdlocks.insert(&cur->filelock);
3211 }
3212
3213 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3214 return;
3215
3216 mask = MAY_READ;
3217 if (cmode & CEPH_FILE_MODE_WR)
3218 mask |= MAY_WRITE;
3219 if (!check_access(mdr, cur, mask))
3220 return;
3221
3222 if (cur->is_file() || cur->is_dir()) {
3223 if (mdr->snapid == CEPH_NOSNAP) {
3224 // register new cap
3225 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3226 if (cap)
3227 dout(12) << "open issued caps " << ccap_string(cap->pending())
3228 << " for " << req->get_source()
3229 << " on " << *cur << dendl;
3230 } else {
3231 int caps = ceph_caps_for_mode(cmode);
3232 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3233 << " for " << req->get_source()
3234 << " snapid " << mdr->snapid
3235 << " on " << *cur << dendl;
3236 mdr->snap_caps = caps;
3237 }
3238 }
3239
3240 // increase max_size?
3241 if (cmode & CEPH_FILE_MODE_WR)
3242 mds->locker->check_inode_max_size(cur);
3243
3244 // make sure this inode gets into the journal
3245 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3246 !cur->item_open_file.is_on_list()) {
3247 LogSegment *ls = mds->mdlog->get_current_segment();
3248 EOpen *le = new EOpen(mds->mdlog);
3249 mdlog->start_entry(le);
3250 le->add_clean_inode(cur);
3251 ls->open_files.push_back(&cur->item_open_file);
3252 mdlog->submit_entry(le);
3253 }
3254
3255 // hit pop
3256 if (cmode & CEPH_FILE_MODE_WR)
3257 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
3258 else
3259 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
3260 mdr->client_request->get_source().num());
3261
3262 CDentry *dn = 0;
3263 if (req->get_dentry_wanted()) {
3264 assert(mdr->dn[0].size());
3265 dn = mdr->dn[0].back();
3266 }
3267
3268 mdr->tracei = cur;
3269 mdr->tracedn = dn;
3270 respond_to_request(mdr, 0);
3271 }
3272
// Journal-commit finisher for handle_client_openc(): makes the projected
// dentry/inode live, marks them dirty, notifies peers, and replies.
class C_MDS_openc_finish : public ServerLogContext {
  CDentry *dn;     // the (projected) dentry created for the new file
  CInode *newi;    // the newly created inode
  snapid_t follows;
public:
  C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
    ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
  void finish(int r) override {
    assert(r == 0);

    // make the projected linkage (set up in handle_client_openc) live
    dn->pop_projected_linkage();

    // dirty inode, dn, dir
    newi->inode.version--;   // a bit hacky, see C_MDS_mknod_finish
    newi->mark_dirty(newi->inode.version+1, mdr->ls);
    newi->_mark_dirty_parent(mdr->ls, true);

    mdr->apply();

    get_mds()->locker->share_inode_max_size(newi);

    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // debug kill point for testing openc failure recovery
    assert(g_conf->mds_kill_openc_at != 1);
  }
};
3304
/* This function takes responsibility for the passed mdr*/
/*
 * Handle open with O_CREAT: if the target already exists (and O_EXCL is
 * not set) fall back to handle_client_open(); otherwise create the new
 * inode with the requested/inherited layout, project the dentry linkage,
 * journal an "openc" EUpdate, and reply via C_MDS_openc_finish.
 */
void Server::handle_client_openc(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  if (cmode < 0) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  if (!(req->head.args.open.flags & CEPH_O_EXCL)) {
    // without O_EXCL, first see whether the path already exists
    int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
				   &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
    if (r > 0) return;   // traversal in progress / forwarded
    if (r == 0) {
      // it existed.
      handle_client_open(mdr);
      return;
    }
    if (r < 0 && r != -ENOENT) {
      if (r == -ESTALE) {
	dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
	MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
	mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
      } else {
	dout(10) << "FAIL on error " << r << dendl;
	respond_to_request(mdr, r);
      }
      return;
    }
    // r == -ENOENT
  }

  bool excl = (req->head.args.open.flags & CEPH_O_EXCL);
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
                                         !excl, false, false, &dir_layout);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // cannot create inside a snapshot
    respond_to_request(mdr, -EROFS);
    return;
  }
  // set layout
  file_layout_t layout;
  if (dir_layout)
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  // What kind of client caps are required to complete this operation
  uint64_t access = MAY_WRITE;

  // remember the inherited layout so we can tell if the client changed it
  const auto default_layout = layout;

  // fill in any special params from client
  if (req->head.args.open.stripe_unit)
    layout.stripe_unit = req->head.args.open.stripe_unit;
  if (req->head.args.open.stripe_count)
    layout.stripe_count = req->head.args.open.stripe_count;
  if (req->head.args.open.object_size)
    layout.object_size = req->head.args.open.object_size;
  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
      (__s32)req->head.args.open.pool >= 0) {
    layout.pool_id = req->head.args.open.pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // If client doesn't have capability to modify layout pools, then
  // only permit this request if the requested pool matches what the
  // file would have inherited anyway from its parent.
  if (default_layout != layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << " invalid initial file layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  // rdlock authlock so check_access sees a stable owner/mode
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, access))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  CDentry::linkage_t *dnl = dn->get_projected_linkage();

  if (!dnl->is_null()) {
    // it existed.
    assert(req->head.args.open.flags & CEPH_O_EXCL);
    dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
    mdr->tracei = dnl->get_inode();
    mdr->tracedn = dn;
    respond_to_request(mdr, -EEXIST);
    return;
  }

  // created null dn.

  // create inode.
  SnapRealm *realm = diri->find_snaprealm();   // use directory's realm; inode isn't attached yet.
  snapid_t follows = realm->get_newest_seq();

  CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				 req->head.args.open.mode | S_IFREG, &layout);
  assert(in);

  // it's a file.
  dn->push_projected_linkage(in);

  in->inode.version = dn->pre_dirty();
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  in->inode.update_backtrace();
  if (cmode & CEPH_FILE_MODE_WR) {
    // writable open: seed the client's writable range
    in->inode.client_ranges[client].range.first = 0;
    in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
    in->inode.client_ranges[client].follows = follows;
  }
  in->inode.rstat.rfiles = 1;

  assert(dn->first == follows+1);
  in->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, in, true, true, true);

  // do the open
  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
  in->authlock.set_state(LOCK_EXCL);
  in->xattrlock.set_state(LOCK_EXCL);

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(in->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&in->item_open_file);

  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);

  if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
    dout(10) << "adding ino to reply to indicate inode was created" << dendl;
    // add the file created flag onto the reply if create_flags features is supported
    ::encode(in->inode.ino, mdr->reply_extra_bl);
  }

  journal_and_reply(mdr, in, dn, le, fin);

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple opencs in flight), so here is
  // an early chance to split the dir if this openc makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
}
3485
3486
3487
/**
 * Handle a client readdir request: resolve the directory, pick/adjust the
 * requested dirfrag, then encode as many dentries+inodes as fit within the
 * client's max_entries/max_bytes budget into the reply's extra bufferlist.
 *
 * Pagination: the client passes either an offset string (last name seen)
 * or, with CEPH_READDIR_REPLY_BITFLAGS, an offset hash; entries up to and
 * including that key are skipped.
 */
void Server::handle_client_readdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = req->get_source().num();
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
  if (!diri) return;

  // it's a directory, right?
  if (!diri->is_dir()) {
    // not a dir
    dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // need stable contents and fragtree to enumerate
  rdlocks.insert(&diri->filelock);
  rdlocks.insert(&diri->dirfragtreelock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  // which frag?
  frag_t fg = (__u32)req->head.args.readdir.frag;
  unsigned req_flags = (__u32)req->head.args.readdir.flags;
  string offset_str = req->get_path2();

  __u32 offset_hash = 0;
  if (!offset_str.empty())
    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
  else
    offset_hash = (__u32)req->head.args.readdir.offset_hash;

  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
	   << " offset_hash " << offset_hash << " flags " << req_flags << dendl;

  // does the frag exist?
  if (diri->dirfragtree[fg.value()] != fg) {
    // the fragtree changed since the client's last readdir; remap
    frag_t newfg;
    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
      if (fg.contains((unsigned)offset_hash)) {
	newfg = diri->dirfragtree[offset_hash];
      } else {
	// client actually wants next frag
	newfg = diri->dirfragtree[fg.value()];
      }
    } else {
      // old-style client: restart the frag from the beginning
      offset_str.clear();
      newfg = diri->dirfragtree[fg.value()];
    }
    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
    fg = newfg;
  }

  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
  if (!dir) return;

  // ok!
  dout(10) << "handle_client_readdir on " << *dir << dendl;
  assert(dir->is_auth());

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      dout(7) << "dir is frozen " << *dir << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
    // fetch
    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
    return;
  }

#ifdef MDS_VERIFY_FRAGSTAT
  dir->verify_fragstat();
#endif

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  snapid_t snapid = mdr->snapid;
  dout(10) << "snapid " << snapid << dendl;

  SnapRealm *realm = diri->find_snaprealm();

  // entry-count and byte budget for this reply
  unsigned max = req->head.args.readdir.max_entries;
  if (!max)
    max = dir->get_num_any();  // whatever, something big.
  unsigned max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;

  // start final blob
  bufferlist dirbl;
  dir->encode_dirstat(dirbl, mds->get_nodeid());

  // count bytes available.
  //  this isn't perfect, but we should capture the main variable/unbounded size items!
  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
  int bytes_left = max_bytes - front_bytes;
  bytes_left -= realm->get_snap_trace().length();

  // build dir contents
  bufferlist dnbl;
  __u32 numfiles = 0;
  bool start = !offset_hash && offset_str.empty();
  bool end = (dir->begin() == dir->end());
  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
  for (CDir::map_t::iterator it = start ? dir->begin() : dir->lower_bound(skip_key);
       !end && numfiles < max;
       end = (it == dir->end())) {
    CDentry *dn = it->second;
    ++it;

    if (dn->state_test(CDentry::STATE_PURGING))
      continue;

    // use projected linkage only if this client's request projected it
    bool dnp = dn->use_projected(client, mdr);
    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();

    if (dnl->is_null())
      continue;

    if (dn->last < snapid || dn->first > snapid) {
      dout(20) << "skipping non-overlapping snap " << *dn << dendl;
      continue;
    }

    if (!start) {
      // lower_bound may land on the offset entry itself; skip up to it
      dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
      if (!(offset_key < dn->key()))
	continue;
    }

    CInode *in = dnl->get_inode();

    if (in && in->ino() == CEPH_INO_CEPH)
      continue;

    // remote link?
    // better for the MDS to do the work, if we think the client will stat any of these files.
    if (dnl->is_remote() && !in) {
      in = mdcache->get_inode(dnl->get_remote_ino());
      if (in) {
	dn->link_remote(dnl, in);
      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
	dout(10) << "skipping bad remote ino on " << *dn << dendl;
	continue;
      } else {
	// touch everything i _do_ have
	for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
	  if (!p->second->get_linkage()->is_null())
	    mdcache->lru.lru_touch(p->second);

	// already issued caps and leases, reply immediately.
	if (dnbl.length() > 0) {
	  mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
	  dout(10) << " open remote dentry after caps were issued, stopping at "
		   << dnbl.length() << " < " << bytes_left << dendl;
	  break;
	}

	// nothing encoded yet: safe to drop locks and retry after open
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }
    }
    assert(in);

    // conservative size check before encoding this entry
    if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
      dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
      break;
    }

    unsigned start_len = dnbl.length();

    // dentry
    dout(12) << "including    dn " << *dn << dendl;
    ::encode(dn->name, dnbl);
    mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);

    // inode
    dout(12) << "including inode " << *in << dendl;
    int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
    if (r < 0) {
      // chop off dn->name, lease
      dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    assert(r >= 0);
    numfiles++;

    // touch dn
    mdcache->lru.lru_touch(dn);
  }

  __u16 flags = 0;
  if (end) {
    flags = CEPH_READDIR_FRAG_END;
    if (start)
      flags |= CEPH_READDIR_FRAG_COMPLETE;  // FIXME: what purpose does this serve
  }
  // client only understand END and COMPLETE flags ?
  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
  }

  // finish final blob
  ::encode(numfiles, dirbl);
  ::encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  // yay, reply
  dout(10) << "reply to " << *req << " readdir num=" << numfiles
	   << " bytes=" << dirbl.length()
	   << " start=" << (int)start
	   << " end=" << (int)end
	   << dendl;
  mdr->reply_extra_bl = dirbl;

  // bump popularity.  NOTE: this doesn't quite capture it.
  mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);

  // reply
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
3726
3727
3728
3729 // ===============================================================================
3730 // INODE UPDATES
3731
3732
/*
 * finisher for basic inode updates: applies the projected inode once the
 * journal entry commits, kicks off any client-visible truncate, bumps
 * balancer stats, and replies to the client.
 */
class C_MDS_inode_update_finish : public ServerLogContext {
  CInode *in;
  // truncating_smaller: the update shrank the file, so clients/objects
  // may need truncation; changed_ranges: client writable ranges changed
  bool truncating_smaller, changed_ranges;
public:
  C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
			    bool sm=false, bool cr=false) :
    ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
  void finish(int r) override {
    assert(r == 0);

    // apply
    in->pop_and_dirty_projected_inode(mdr->ls);
    mdr->apply();

    // notify any clients
    if (truncating_smaller && in->inode.is_truncating()) {
      get_mds()->locker->issue_truncate(in);
      get_mds()->mdcache->truncate_inode(in, mdr->ls);
    }

    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    // let other clients know about the new max_size after we've replied
    if (changed_ranges)
      get_mds()->locker->share_inode_max_size(in);
  }
};
3764
/**
 * Handle a client file-lock request (flock / POSIX fcntl record locks).
 *
 * Takes an xlock on the inode's flocklock, then applies the requested
 * change to the appropriate ceph_lock_state_t.  Unlocks reply
 * immediately; blocked lock attempts are parked on the inode's
 * WAIT_FLOCK queue (with flock_was_waiting recorded on the mdr) unless
 * the client asked not to wait, in which case -EWOULDBLOCK is returned.
 */
void Server::handle_client_file_setlock(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  // get the inode to operate on, and set up any locks needed for that
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur)
    return;

  xlocks.insert(&cur->flocklock);
  /* acquire_locks will return true if it gets the locks. If it fails,
     it will redeliver this request at a later date, so drop the request.
   */
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
    dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
    return;
  }

  // copy the lock change into a ceph_filelock so we can store/apply it
  ceph_filelock set_lock;
  set_lock.start = req->head.args.filelock_change.start;
  set_lock.length = req->head.args.filelock_change.length;
  set_lock.client = req->get_orig_source().num();
  set_lock.owner = req->head.args.filelock_change.owner;
  set_lock.pid = req->head.args.filelock_change.pid;
  set_lock.type = req->head.args.filelock_change.type;
  bool will_wait = req->head.args.filelock_change.wait;

  dout(10) << "handle_client_file_setlock: " << set_lock << dendl;

  ceph_lock_state_t *lock_state = NULL;
  bool interrupt = false;

  // get the appropriate lock state (the _INTR variants cancel a
  // previously blocked attempt rather than place a new lock)
  switch (req->head.args.filelock_change.rule) {
  case CEPH_LOCK_FLOCK_INTR:
    interrupt = true;
    // fall-thru
  case CEPH_LOCK_FLOCK:
    lock_state = cur->get_flock_lock_state();
    break;

  case CEPH_LOCK_FCNTL_INTR:
    interrupt = true;
    // fall-thru
  case CEPH_LOCK_FCNTL:
    lock_state = cur->get_fcntl_lock_state();
    break;

  default:
    dout(10) << "got unknown lock type " << set_lock.type
	     << ", dropping request!" << dendl;
    respond_to_request(mdr, -EOPNOTSUPP);
    return;
  }

  dout(10) << " state prior to lock change: " << *lock_state << dendl;
  if (CEPH_LOCK_UNLOCK == set_lock.type) {
    // unlock (or cancel a waiting attempt), then wake any blocked waiters
    list<ceph_filelock> activated_locks;
    list<MDSInternalContextBase*> waiters;
    if (lock_state->is_waiting(set_lock)) {
      dout(10) << " unlock removing waiting lock " << set_lock << dendl;
      lock_state->remove_waiting(set_lock);
      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
    } else if (!interrupt) {
      dout(10) << " unlock attempt on " << set_lock << dendl;
      lock_state->remove_lock(set_lock, activated_locks);
      cur->take_waiting(CInode::WAIT_FLOCK, waiters);
    }
    mds->queue_waiters(waiters);

    respond_to_request(mdr, 0);
  } else {
    dout(10) << " lock attempt on " << set_lock << dendl;
    bool deadlock = false;
    if (mdr->more()->flock_was_waiting &&
	!lock_state->is_waiting(set_lock)) {
      // we were parked on the wait queue but the entry is gone: canceled
      dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
      respond_to_request(mdr, -EINTR);
    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
      dout(10) << " it failed on this attempt" << dendl;
      // couldn't set lock right now
      if (deadlock) {
	respond_to_request(mdr, -EDEADLK);
      } else if (!will_wait) {
	respond_to_request(mdr, -EWOULDBLOCK);
      } else {
	dout(10) << " added to waiting list" << dendl;
	assert(lock_state->is_waiting(set_lock));
	mdr->more()->flock_was_waiting = true;
	// drop our locks so others can progress; retry when woken
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
      }
    } else
      respond_to_request(mdr, 0);
  }
  dout(10) << " state after lock change: " << *lock_state << dendl;
}
3865
// Handle a client "get file lock" request (F_GETLK-style query): check the
// client-described lock against the inode's current flock/fcntl lock state
// and reply with any conflicting lock found, without changing lock state.
3866 void Server::handle_client_file_readlock(MDRequestRef& mdr)
3867 {
3868 MClientRequest *req = mdr->client_request;
3869 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3870
3871 // get the inode to operate on, and set up any locks needed for that
3872 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3873 if (!cur)
3874 return;
3875
3876 /* acquire_locks will return true if it gets the locks. If it fails,
3877 it will redeliver this request at a later date, so drop the request.
3878 */
// Only a read lock on flocklock is needed: we inspect, never mutate, state.
3879 rdlocks.insert(&cur->flocklock);
3880 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3881 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
3882 return;
3883 }
3884
3885 // copy the lock change into a ceph_filelock so we can store/apply it
3886 ceph_filelock checking_lock;
3887 checking_lock.start = req->head.args.filelock_change.start;
3888 checking_lock.length = req->head.args.filelock_change.length;
3889 checking_lock.client = req->get_orig_source().num();
3890 checking_lock.owner = req->head.args.filelock_change.owner;
3891 checking_lock.pid = req->head.args.filelock_change.pid;
3892 checking_lock.type = req->head.args.filelock_change.type;
3893
3894 // get the appropriate lock state
// 'rule' selects which lock family (BSD flock vs POSIX fcntl) to query.
// Note: unlike handle_client_file_setlock, the *_INTR variants are not
// accepted here.
3895 ceph_lock_state_t *lock_state = NULL;
3896 switch (req->head.args.filelock_change.rule) {
3897 case CEPH_LOCK_FLOCK:
3898 lock_state = cur->get_flock_lock_state();
3899 break;
3900
3901 case CEPH_LOCK_FCNTL:
3902 lock_state = cur->get_fcntl_lock_state();
3903 break;
3904
3905 default:
3906 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
3907 respond_to_request(mdr, -EINVAL);
3908 return;
3909 }
// presumably look_for_lock() rewrites checking_lock in place to describe a
// conflicting holder, if any -- TODO confirm against ceph_lock_state_t.
3910 lock_state->look_for_lock(checking_lock);
3911
// Encode the (possibly updated) lock record into the reply's extra payload
// so the client can decode the conflict information.
3912 bufferlist lock_bl;
3913 ::encode(checking_lock, lock_bl);
3914
3915 mdr->reply_extra_bl = lock_bl;
3916 respond_to_request(mdr, 0);
3917 }
3918
// Handle a client setattr request: validate permissions, project the inode
// attribute changes (mode/uid/gid/times/size), journal an EUpdate, and reply
// once the log entry is safe.  Truncation to a smaller size is started here
// via pi->truncate(); growth just bumps pi->size.
3919 void Server::handle_client_setattr(MDRequestRef& mdr)
3920 {
3921 MClientRequest *req = mdr->client_request;
3922 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3923 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3924 if (!cur) return;
3925
// Snapshotted inodes are read-only.
3926 if (mdr->snapid != CEPH_NOSNAP) {
3927 respond_to_request(mdr, -EROFS);
3928 return;
3929 }
// Refuse attribute changes on reserved system inodes (except base inodes).
3930 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
3931 respond_to_request(mdr, -EPERM);
3932 return;
3933 }
3934
3935 __u32 mask = req->head.args.setattr.mask;
3936 __u32 access_mask = MAY_WRITE;
3937
3938 // xlock inode
// Choose which inode locks to take based on which fields change:
// authlock covers ownership/mode, filelock covers size/times.
3939 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
3940 xlocks.insert(&cur->authlock);
3941 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
3942 xlocks.insert(&cur->filelock);
3943 if (mask & CEPH_SETATTR_CTIME)
3944 wrlocks.insert(&cur->versionlock);
3945
3946 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3947 return;
3948
// Changing ownership requires extra capability bits beyond plain write.
3949 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
3950 access_mask |= MAY_CHOWN;
3951
3952 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
3953 access_mask |= MAY_CHGRP;
3954
3955 if (!check_access(mdr, cur, access_mask))
3956 return;
3957
3958 // trunc from bigger -> smaller?
3959 inode_t *pi = cur->get_projected_inode();
3960
// old_size accounts for the client-reported size (old_size arg), which may
// exceed what the MDS has recorded.
3961 uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);
3962
3963 // ENOSPC on growing file while full, but allow shrinks
3964 if (is_full && req->head.args.setattr.size > old_size) {
3965 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
3966 respond_to_request(mdr, -ENOSPC);
3967 return;
3968 }
3969
3970 bool truncating_smaller = false;
3971 if (mask & CEPH_SETATTR_SIZE) {
3972 truncating_smaller = req->head.args.setattr.size < old_size;
// Only one truncate may be in flight; drop locks and retry after the
// pending truncate finishes.
3973 if (truncating_smaller && pi->is_truncating()) {
3974 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3975 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3976 mds->locker->drop_locks(mdr.get());
3977 mdr->drop_local_auth_pins();
3978 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3979 return;
3980 }
3981 }
3982
3983 bool changed_ranges = false;
3984
3985 // project update
3986 mdr->ls = mdlog->get_current_segment();
3987 EUpdate *le = new EUpdate(mdlog, "setattr");
3988 mdlog->start_entry(le);
3989
3990 pi = cur->project_inode();
3991
3992 if (mask & CEPH_SETATTR_UID)
3993 pi->uid = req->head.args.setattr.uid;
3994 if (mask & CEPH_SETATTR_GID)
3995 pi->gid = req->head.args.setattr.gid;
3996
3997 if (mask & CEPH_SETATTR_MODE)
3998 pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
// If ownership changes (or KILL_SGUID is requested) on a regular file,
// clear setuid, and clear setgid only when group-execute is also set.
3999 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4000 S_ISREG(pi->mode)) {
4001 pi->mode &= ~S_ISUID;
4002 if ((pi->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
4003 pi->mode &= ~S_ISGID;
4004 }
4005
4006 if (mask & CEPH_SETATTR_MTIME)
4007 pi->mtime = req->head.args.setattr.mtime;
4008 if (mask & CEPH_SETATTR_ATIME)
4009 pi->atime = req->head.args.setattr.atime;
4010 if (mask & CEPH_SETATTR_BTIME)
4011 pi->btime = req->head.args.setattr.btime;
4012 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4013 pi->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4014 if (mask & CEPH_SETATTR_SIZE) {
4015 if (truncating_smaller) {
// Shrink: record a truncate-start in the journal; actual data purge
// happens after the event is durable.
4016 pi->truncate(old_size, req->head.args.setattr.size);
4017 le->metablob.add_truncate_start(cur->ino());
4018 } else {
4019 pi->size = req->head.args.setattr.size;
4020 pi->rstat.rbytes = pi->size;
4021 }
4022 pi->mtime = mdr->get_op_stamp();
4023
4024 // adjust client's max_size?
4025 map<client_t,client_writeable_range_t> new_ranges;
4026 bool max_increased = false;
4027 mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
4028 if (pi->client_ranges != new_ranges) {
4029 dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
4030 pi->client_ranges = new_ranges;
4031 changed_ranges = true;
4032 }
4033 }
4034
4035 pi->version = cur->pre_dirty();
4036 pi->ctime = mdr->get_op_stamp();
4037 pi->change_attr++;
4038
4039 // log + wait
4040 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4041 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4042 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4043
4044 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4045 truncating_smaller, changed_ranges));
4046
4047 // flush immediately if there are readers/writers waiting
4048 if (xlocks.count(&cur->filelock) &&
4049 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4050 mds->mdlog->flush();
4051 }
4052
4053 /* Takes responsibility for mdr */
// Open with O_TRUNC: issue caps for the new open mode, project a truncate of
// the inode to size 0, journal the update, and reply.  Flushes the log
// explicitly because the truncate only proceeds once the EUpdate is durable.
4054 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4055 {
4056 CInode *in = mdr->in[0];
4057 client_t client = mdr->get_client();
4058 assert(in);
4059
4060 dout(10) << "do_open_truncate " << *in << dendl;
4061
// Issue caps for the open before journaling the truncate.
4062 SnapRealm *realm = in->find_snaprealm();
4063 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4064
4065 mdr->ls = mdlog->get_current_segment();
4066 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4067 mdlog->start_entry(le);
4068
4069 // prepare
4070 inode_t *pi = in->project_inode();
4071 pi->version = in->pre_dirty();
4072 pi->mtime = pi->ctime = mdr->get_op_stamp();
4073 pi->change_attr++;
4074
// Use the larger of the recorded size and the client-reported old size;
// only journal a truncate if there is actually data to discard.
4075 uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
4076 if (old_size > 0) {
4077 pi->truncate(old_size, 0);
4078 le->metablob.add_truncate_start(in->ino());
4079 }
4080
// For writable opens, seed the client's writeable range with one layout
// increment so it can write without an immediate max_size round trip.
4081 bool changed_ranges = false;
4082 if (cmode & CEPH_FILE_MODE_WR) {
4083 pi->client_ranges[client].range.first = 0;
4084 pi->client_ranges[client].range.last = pi->get_layout_size_increment();
4085 pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
4086 changed_ranges = true;
4087 }
4088
4089 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4090
4091 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4092 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4093
4094 // make sure ino gets into the journal
4095 le->metablob.add_opened_ino(in->ino());
4096 LogSegment *ls = mds->mdlog->get_current_segment();
4097 ls->open_files.push_back(&in->item_open_file);
4098
4099 mdr->o_trunc = true;
4100
4101 CDentry *dn = 0;
4102 if (mdr->client_request->get_dentry_wanted()) {
4103 assert(mdr->dn[0].size());
4104 dn = mdr->dn[0].back();
4105 }
4106
4107 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4108 changed_ranges));
4109 // Although the `open` part can give an early reply, the truncation won't
4110 // happen until our EUpdate is persistent, to give the client a prompt
4111 // response we must also flush that event.
4112 mdlog->flush();
4113 }
4114
4115
4116 /* This function cleans up the passed mdr */
// Set the file layout (ioctl-style) on a regular, still-empty file.
// Merges the client-supplied non-zero fields into the existing layout,
// validates it, and journals the change.  Layout changes beyond plain
// write access require the MAY_SET_VXATTR ('p') capability.
4117 void Server::handle_client_setlayout(MDRequestRef& mdr)
4118 {
4119 MClientRequest *req = mdr->client_request;
4120 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4121 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4122 if (!cur) return;
4123
4124 if (mdr->snapid != CEPH_NOSNAP) {
4125 respond_to_request(mdr, -EROFS);
4126 return;
4127 }
4128 if (!cur->is_file()) {
4129 respond_to_request(mdr, -EINVAL);
4130 return;
4131 }
// Layout may only change while the file has never held data.
4132 if (cur->get_projected_inode()->size ||
4133 cur->get_projected_inode()->truncate_seq > 1) {
4134 respond_to_request(mdr, -ENOTEMPTY);
4135 return;
4136 }
4137
4138 // validate layout
4139 file_layout_t layout = cur->get_projected_inode()->layout;
4140 // save existing layout for later
4141 const auto old_layout = layout;
4142
4143 int access = MAY_WRITE;
4144
// Zero-valued fields in the request mean "keep the current value".
4145 if (req->head.args.setlayout.layout.fl_object_size > 0)
4146 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4147 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4148 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4149 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4150 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4151 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4152 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4153
4154 // make sure we have as new a map as the client
4155 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4156 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4157 return;
4158 }
4159 }
4160
4161 // Don't permit layout modifications without 'p' caps
4162 if (layout != old_layout) {
4163 access |= MAY_SET_VXATTR;
4164 }
4165
4166 if (!layout.is_valid()) {
4167 dout(10) << "bad layout" << dendl;
4168 respond_to_request(mdr, -EINVAL);
4169 return;
4170 }
4171 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4172 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4173 respond_to_request(mdr, -EINVAL);
4174 return;
4175 }
4176
4177 xlocks.insert(&cur->filelock);
4178 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4179 return;
4180
4181 if (!check_access(mdr, cur, access))
4182 return;
4183
4184 // project update
4185 inode_t *pi = cur->project_inode();
4186 pi->layout = layout;
4187 // add the old pool to the inode
// Remember the previous pool so backtraces/objects there remain findable.
4188 pi->add_old_pool(old_layout.pool_id);
4189 pi->version = cur->pre_dirty();
4190 pi->ctime = mdr->get_op_stamp();
4191 pi->change_attr++;
4192
4193 // log + wait
4194 mdr->ls = mdlog->get_current_segment();
4195 EUpdate *le = new EUpdate(mdlog, "setlayout");
4196 mdlog->start_entry(le);
4197 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4198 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4199 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4200
4201 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4202 }
4203
// Set the default file layout policy on a directory.  The starting layout is
// taken from the directory's own projected layout if set, else the nearest
// ancestor policy (dir_layout), else the filesystem default; requested
// non-zero fields are then merged in, validated, and journaled.
4204 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4205 {
4206 MClientRequest *req = mdr->client_request;
4207 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4208 file_layout_t *dir_layout = NULL;
4209 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4210 if (!cur) return;
4211
4212 if (mdr->snapid != CEPH_NOSNAP) {
4213 respond_to_request(mdr, -EROFS);
4214 return;
4215 }
4216
4217 if (!cur->is_dir()) {
4218 respond_to_request(mdr, -ENOTDIR);
4219 return;
4220 }
4221
// Directory layout policy is guarded by policylock, not filelock.
4222 xlocks.insert(&cur->policylock);
4223 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4224 return;
4225
4226 // validate layout
4227 const inode_t *old_pi = cur->get_projected_inode();
4228 file_layout_t layout;
4229 if (old_pi->has_layout())
4230 layout = old_pi->layout;
4231 else if (dir_layout)
4232 layout = *dir_layout;
4233 else
4234 layout = mdcache->default_file_layout;
4235
4236 // Level of access required to complete
4237 int access = MAY_WRITE;
4238
4239 const auto old_layout = layout;
4240
// Zero-valued fields in the request mean "keep the current value".
4241 if (req->head.args.setlayout.layout.fl_object_size > 0)
4242 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4243 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4244 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4245 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4246 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4247 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4248 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4249 // make sure we have as new a map as the client
4250 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4251 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4252 return;
4253 }
4254 }
4255
// Actually changing the layout additionally requires 'p' caps.
4256 if (layout != old_layout) {
4257 access |= MAY_SET_VXATTR;
4258 }
4259
4260 if (!layout.is_valid()) {
4261 dout(10) << "bad layout" << dendl;
4262 respond_to_request(mdr, -EINVAL);
4263 return;
4264 }
4265 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4266 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4267 respond_to_request(mdr, -EINVAL);
4268 return;
4269 }
4270
4271 if (!check_access(mdr, cur, access))
4272 return;
4273
4274 inode_t *pi = cur->project_inode();
4275 pi->layout = layout;
4276 pi->version = cur->pre_dirty();
4277
4278 // log + wait
4279 mdr->ls = mdlog->get_current_segment();
4280 EUpdate *le = new EUpdate(mdlog, "setlayout");
4281 mdlog->start_entry(le);
4282 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4283 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4284 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4285
4286 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4287 }
4288
4289 // XATTRS
4290
// Parse a single layout vxattr ("layout" or "layout.<field>") into *layout.
// The composite "layout" form is a key=value list parsed with boost.qi and
// recursively applied field by field (validation deferred to the end, so
// intermediate field combinations may be transiently invalid).
// Returns 0 on success, -EINVAL on parse/validation failure, -ENOENT when a
// pool name is not found in the given osdmap.
4291 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4292 file_layout_t *layout, bool validate)
4293 {
4294 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4295 try {
4296 if (name == "layout") {
4297 string::iterator begin = value.begin();
4298 string::iterator end = value.end();
4299 keys_and_values<string::iterator> p; // create instance of parser
4300 std::map<string, string> m; // map to receive results
4301 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4302 return -EINVAL;
4303 }
4304 string left(begin, end);
4305 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
// Any trailing unparsed text means the whole value is malformed.
4306 if (begin != end)
4307 return -EINVAL;
4308 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4309 // Skip validation on each attr, we do it once at the end (avoid
4310 // rejecting intermediate states if the overall result is ok)
4311 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4312 osdmap, layout, false);
4313 if (r < 0)
4314 return r;
4315 }
4316 } else if (name == "layout.object_size") {
4317 layout->object_size = boost::lexical_cast<unsigned>(value);
4318 } else if (name == "layout.stripe_unit") {
4319 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4320 } else if (name == "layout.stripe_count") {
4321 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4322 } else if (name == "layout.pool") {
// The pool may be given numerically or by name; fall back to a name
// lookup in the osdmap when the numeric cast fails.
4323 try {
4324 layout->pool_id = boost::lexical_cast<unsigned>(value);
4325 } catch (boost::bad_lexical_cast const&) {
4326 int64_t pool = osdmap.lookup_pg_pool_name(value);
4327 if (pool < 0) {
4328 dout(10) << " unknown pool " << value << dendl;
4329 return -ENOENT;
4330 }
4331 layout->pool_id = pool;
4332 }
4333 } else if (name == "layout.pool_namespace") {
4334 layout->pool_ns = value;
4335 } else {
4336 dout(10) << " unknown layout vxattr " << name << dendl;
4337 return -EINVAL;
4338 }
4339 } catch (boost::bad_lexical_cast const&) {
4340 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4341 return -EINVAL;
4342 }
4343
4344 if (validate && !layout->is_valid()) {
4345 dout(10) << "bad layout" << dendl;
4346 return -EINVAL;
4347 }
// Pool existence is checked unconditionally, even when validate is false.
4348 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4349 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4350 return -EINVAL;
4351 }
4352 return 0;
4353 }
4354
// Parse a quota vxattr ("quota" or "quota.max_bytes"/"quota.max_files") into
// *quota.  The composite "quota" form is a key=value list parsed with
// boost.qi and applied recursively.  Negative values are rejected; 0 is the
// "no limit" encoding accepted by quota_info_t.  Returns 0 or -EINVAL.
4355 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4356 {
4357 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4358 try {
4359 if (name == "quota") {
4360 string::iterator begin = value.begin();
4361 string::iterator end = value.end();
4362 keys_and_values<string::iterator> p; // create instance of parser
4363 std::map<string, string> m; // map to receive results
4364 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4365 return -EINVAL;
4366 }
4367 string left(begin, end);
4368 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
// Trailing unparsed text invalidates the whole value.
4369 if (begin != end)
4370 return -EINVAL;
4371 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4372 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4373 if (r < 0)
4374 return r;
4375 }
4376 } else if (name == "quota.max_bytes") {
4377 int64_t q = boost::lexical_cast<int64_t>(value);
4378 if (q < 0)
4379 return -EINVAL;
4380 quota->max_bytes = q;
4381 } else if (name == "quota.max_files") {
4382 int64_t q = boost::lexical_cast<int64_t>(value);
4383 if (q < 0)
4384 return -EINVAL;
4385 quota->max_files = q;
4386 } else {
4387 dout(10) << " unknown quota vxattr " << name << dendl;
4388 return -EINVAL;
4389 }
4390 } catch (boost::bad_lexical_cast const&) {
4391 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4392 return -EINVAL;
4393 }
4394
4395 if (!quota->is_valid()) {
4396 dout(10) << "bad quota" << dendl;
4397 return -EINVAL;
4398 }
4399 return 0;
4400 }
4401
4402 /*
4403 * Verify that the file layout attribute carried by client
4404 * is well-formatted.
4405 * Return 0 on success, otherwise this function takes
4406 * responsibility for the passed mdr.
4407 */
// If the named pool is unknown (-ENOENT), the MDS may simply have an older
// osdmap than the client: wait for the client's epoch (or, for old clients
// that send epoch 0, the latest map) and retry before rejecting.
4408 int Server::check_layout_vxattr(MDRequestRef& mdr,
4409 string name,
4410 string value,
4411 file_layout_t *layout)
4412 {
4413 MClientRequest *req = mdr->client_request;
4414 epoch_t epoch;
4415 int r;
4416
4417 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4418 r = parse_layout_vxattr(name, value, osdmap, layout);
4419 epoch = osdmap.get_epoch();
4420 });
4421
4422 if (r == -ENOENT) {
4423
4424 // we don't have the specified pool, make sure our map
4425 // is newer than or as new as the client.
4426 epoch_t req_epoch = req->get_osdmap_epoch();
4427
4428 if (req_epoch > epoch) {
4429
4430 // well, our map is older. consult mds.
4431 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4432
// wait_for_map() returns false when it queued fin for a later retry;
// in that case fin now owns the request and we must not delete it.
4433 if (!mds->objecter->wait_for_map(req_epoch, fin))
4434 return r; // wait, fin will retry this request later
4435
4436 delete fin;
4437
4438 // now we have at least as new a map as the client, try again.
4439 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4440 r = parse_layout_vxattr(name, value, osdmap, layout);
4441 epoch = osdmap.get_epoch();
4442 });
4443
4444 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4445
4446 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4447
4448 // For compatibility with client w/ old code, we still need get the
4449 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
4450 // we can remove those code.
4451 mdr->waited_for_osdmap = true;
4452 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4453 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4454 return r;
4455 }
4456 }
4457
4458 if (r < 0) {
4459
// A still-unknown pool after the retry is reported as EINVAL, matching
// the other layout-validation failures.
4460 if (r == -ENOENT)
4461 r = -EINVAL;
4462
4463 respond_to_request(mdr, r);
4464 return r;
4465 }
4466
4467 // all is well
4468 return 0;
4469 }
4470
// Apply a "ceph.*" virtual xattr set request: dispatches on the vxattr name
// (dir/file layout, quota, export pin), validates the value, takes the
// branch-appropriate lock, projects the inode change, and journals it.
// Note: lock sets are taken by value on purpose -- each retry re-enters with
// the caller's original sets.
4471 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
4472 file_layout_t *dir_layout,
4473 set<SimpleLock*> rdlocks,
4474 set<SimpleLock*> wrlocks,
4475 set<SimpleLock*> xlocks)
4476 {
4477 MClientRequest *req = mdr->client_request;
4478 string name(req->get_path2());
4479 bufferlist bl = req->get_data();
// The vxattr value travels in the request's data payload.
4480 string value (bl.c_str(), bl.length());
4481 dout(10) << "handle_set_vxattr " << name
4482 << " val " << value.length()
4483 << " bytes on " << *cur
4484 << dendl;
4485
4486 inode_t *pi = NULL;
4487 string rest;
4488
// All vxattr changes require the 'p' capability.
4489 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
4490 return;
4491 }
4492
4493 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
4494 if (!cur->is_dir()) {
4495 respond_to_request(mdr, -EINVAL);
4496 return;
4497 }
4498
// Start from the dir's own layout, else inherited policy, else default.
4499 file_layout_t layout;
4500 if (cur->get_projected_inode()->has_layout())
4501 layout = cur->get_projected_inode()->layout;
4502 else if (dir_layout)
4503 layout = *dir_layout;
4504 else
4505 layout = mdcache->default_file_layout;
4506
// Strip the "ceph.dir." prefix: pass "layout..." to the parser.
4507 rest = name.substr(name.find("layout"));
4508 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4509 return;
4510
4511 xlocks.insert(&cur->policylock);
4512 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4513 return;
4514
4515 pi = cur->project_inode();
4516 pi->layout = layout;
4517 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
4518 if (!cur->is_file()) {
4519 respond_to_request(mdr, -EINVAL);
4520 return;
4521 }
// File layout may only change while the file has never held data.
4522 if (cur->get_projected_inode()->size ||
4523 cur->get_projected_inode()->truncate_seq > 1) {
4524 respond_to_request(mdr, -ENOTEMPTY);
4525 return;
4526 }
4527 file_layout_t layout = cur->get_projected_inode()->layout;
4528 rest = name.substr(name.find("layout"));
4529 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4530 return;
4531
4532 xlocks.insert(&cur->filelock);
4533 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4534 return;
4535
4536 pi = cur->project_inode();
// Track the previous pool so existing backtrace objects stay reachable.
4537 int64_t old_pool = pi->layout.pool_id;
4538 pi->add_old_pool(old_pool);
4539 pi->layout = layout;
4540 pi->ctime = mdr->get_op_stamp();
4541 } else if (name.compare(0, 10, "ceph.quota") == 0) {
4542 if (!cur->is_dir() || cur->is_root()) {
4543 respond_to_request(mdr, -EINVAL);
4544 return;
4545 }
4546
4547 quota_info_t quota = cur->get_projected_inode()->quota;
4548
4549 rest = name.substr(name.find("quota"));
4550 int r = parse_quota_vxattr(rest, value, &quota);
4551 if (r < 0) {
4552 respond_to_request(mdr, r);
4553 return;
4554 }
4555
4556 xlocks.insert(&cur->policylock);
4557 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4558 return;
4559
4560 pi = cur->project_inode();
4561 pi->quota = quota;
4562 } else if (name.find("ceph.dir.pin") == 0) {
4563 if (!cur->is_dir() || cur->is_root()) {
4564 respond_to_request(mdr, -EINVAL);
4565 return;
4566 }
4567
// Parse the target rank; any negative value means "unpin".
4568 mds_rank_t rank;
4569 try {
4570 rank = boost::lexical_cast<mds_rank_t>(value);
4571 if (rank < 0) rank = MDS_RANK_NONE;
4572 } catch (boost::bad_lexical_cast const&) {
4573 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4574 respond_to_request(mdr, -EINVAL);
4575 return;
4576 }
4577
4578 xlocks.insert(&cur->policylock);
4579 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4580 return;
4581
4582 pi = cur->project_inode();
4583 cur->set_export_pin(rank);
4584 } else {
4585 dout(10) << " unknown vxattr " << name << dendl;
4586 respond_to_request(mdr, -EINVAL);
4587 return;
4588 }
4589
// Common tail for all branches: bump attrs and journal the projected inode.
4590 pi->change_attr++;
4591 pi->ctime = mdr->get_op_stamp();
4592 pi->version = cur->pre_dirty();
4593 if (cur->is_file())
4594 pi->update_backtrace();
4595
4596 // log + wait
4597 mdr->ls = mdlog->get_current_segment();
4598 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
4599 mdlog->start_entry(le);
4600 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4601 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4602 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4603
4604 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4605 return;
4606 }
4607
// Remove a "ceph.*" virtual xattr.  Only "ceph.dir.layout" is truly
// removable; removing a pool_namespace is translated into setting it to the
// empty string (handled by handle_set_vxattr with the empty payload).
// Everything else answers ENODATA.
4608 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
4609 file_layout_t *dir_layout,
4610 set<SimpleLock*> rdlocks,
4611 set<SimpleLock*> wrlocks,
4612 set<SimpleLock*> xlocks)
4613 {
4614 MClientRequest *req = mdr->client_request;
4615 string name(req->get_path2());
4616
4617 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
4618
4619 if (name == "ceph.dir.layout") {
4620 if (!cur->is_dir()) {
4621 respond_to_request(mdr, -ENODATA);
4622 return;
4623 }
4624 if (cur->is_root()) {
4625 dout(10) << "can't remove layout policy on the root directory" << dendl;
4626 respond_to_request(mdr, -EINVAL);
4627 return;
4628 }
4629
4630 if (!cur->get_projected_inode()->has_layout()) {
4631 respond_to_request(mdr, -ENODATA);
4632 return;
4633 }
4634
4635 xlocks.insert(&cur->policylock);
4636 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4637 return;
4638
4639 inode_t *pi = cur->project_inode();
4640 pi->clear_layout();
4641 pi->version = cur->pre_dirty();
4642
4643 // log + wait
4644 mdr->ls = mdlog->get_current_segment();
4645 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
4646 mdlog->start_entry(le);
4647 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4648 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4649 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4650
4651 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4652 return;
4653 } else if (name == "ceph.dir.layout.pool_namespace"
4654 || name == "ceph.file.layout.pool_namespace") {
4655 // Namespace is the only layout field that has a meaningful
4656 // null/none value (empty string, means default layout). Is equivalent
4657 // to a setxattr with empty string: pass through the empty payload of
4658 // the rmxattr request to do this.
4659 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4660 return;
4661 }
4662
4663 respond_to_request(mdr, -ENODATA);
4664 }
4665
// Journal-completion context for xattr updates: once the log event is safe,
// commit the projected inode, apply the request's side effects, feed the
// balancer a write hit, and send the (successful) reply to the client.
4666 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4667 CInode *in;
4668 public:
4669
4670 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4671 ServerLogContext(s, r), in(i) { }
4672 void finish(int r) override {
// Journaling is expected to always succeed here.
4673 assert(r == 0);
4674
4675 // apply
4676 in->pop_and_dirty_projected_inode(mdr->ls);
4677
4678 mdr->apply();
4679
4680 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
4681
4682 server->respond_to_request(mdr, 0);
4683 }
4684 };
4685
// Handle a client setxattr: "ceph.*" names are routed to the vxattr handler;
// real xattrs are size-checked against mds_max_xattr_pairs_size, the
// CREATE/REPLACE flags are enforced, and the projected xattr map plus inode
// are journaled.
4686 void Server::handle_client_setxattr(MDRequestRef& mdr)
4687 {
4688 MClientRequest *req = mdr->client_request;
4689 string name(req->get_path2());
4690 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4691 CInode *cur;
4692
// For dir-layout vxattrs we also need the inherited layout policy, so use
// the rdlock_path_pin_ref variant that reports it.
4693 file_layout_t *dir_layout = NULL;
4694 if (name.compare(0, 15, "ceph.dir.layout") == 0)
4695 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4696 else
4697 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4698 if (!cur)
4699 return;
4700
4701 if (mdr->snapid != CEPH_NOSNAP) {
4702 respond_to_request(mdr, -EROFS);
4703 return;
4704 }
4705
4706 int flags = req->head.args.setxattr.flags;
4707
4708 // magic ceph.* namespace?
4709 if (name.compare(0, 5, "ceph.") == 0) {
4710 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4711 return;
4712 }
4713
4714 xlocks.insert(&cur->xattrlock);
4715 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4716 return;
4717
4718 if (!check_access(mdr, cur, MAY_WRITE))
4719 return;
4720
4721 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4722 size_t len = req->get_data().length();
4723 size_t inc = len + name.length();
4724
4725 // check xattrs kv pairs size
// Sum existing pair sizes, excluding the pair being replaced so a same-name
// REPLACE is not double-counted against the limit.
4726 size_t cur_xattrs_size = 0;
4727 for (const auto& p : *pxattrs) {
4728 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
4729 continue;
4730 }
4731 cur_xattrs_size += p.first.length() + p.second.length();
4732 }
4733
4734 if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
4735 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4736 << cur_xattrs_size << ", inc " << inc << dendl;
4737 respond_to_request(mdr, -ENOSPC);
4738 return;
4739 }
4740
// Enforce XATTR_CREATE (must not exist) / XATTR_REPLACE (must exist).
4741 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
4742 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
4743 respond_to_request(mdr, -EEXIST);
4744 return;
4745 }
4746 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
4747 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
4748 respond_to_request(mdr, -ENODATA);
4749 return;
4750 }
4751
4752 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
4753
4754 // project update
// project_inode(px) attaches the new xattr map to the projected inode;
// presumably it takes ownership of px -- TODO confirm against CInode.
4755 map<string,bufferptr> *px = new map<string,bufferptr>;
4756 inode_t *pi = cur->project_inode(px);
4757 pi->version = cur->pre_dirty();
4758 pi->ctime = mdr->get_op_stamp();
4759 pi->change_attr++;
4760 pi->xattr_version++;
// Drop any old value first; CEPH_XATTR_REMOVE leaves the name absent.
4761 px->erase(name);
4762 if (!(flags & CEPH_XATTR_REMOVE)) {
4763 (*px)[name] = buffer::create(len);
4764 if (len)
4765 req->get_data().copy(0, len, (*px)[name].c_str());
4766 }
4767
4768 // log + wait
4769 mdr->ls = mdlog->get_current_segment();
4770 EUpdate *le = new EUpdate(mdlog, "setxattr");
4771 mdlog->start_entry(le);
4772 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4773 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4774 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4775
4776 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4777 }
4778
// Handle a client removexattr: "ceph.*" names go to the vxattr removal
// handler; for real xattrs the name must exist (else ENODATA), and the
// projected xattr map without the name is journaled with the inode update.
4779 void Server::handle_client_removexattr(MDRequestRef& mdr)
4780 {
4781 MClientRequest *req = mdr->client_request;
4782 string name(req->get_path2());
4783 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4784 file_layout_t *dir_layout = NULL;
4785 CInode *cur;
// The dir-layout vxattr needs the inherited policy for its handler.
4786 if (name == "ceph.dir.layout")
4787 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4788 else
4789 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4790 if (!cur)
4791 return;
4792
4793 if (mdr->snapid != CEPH_NOSNAP) {
4794 respond_to_request(mdr, -EROFS);
4795 return;
4796 }
4797
4798 if (name.compare(0, 5, "ceph.") == 0) {
4799 handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4800 return;
4801 }
4802
4803 xlocks.insert(&cur->xattrlock);
4804 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4805 return;
4806
4807 map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
4808 if (pxattrs->count(name) == 0) {
4809 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
4810 respond_to_request(mdr, -ENODATA);
4811 return;
4812 }
4813
4814 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
4815
4816 // project update
// Project a fresh xattr map and erase the entry from it; the previous map
// stays untouched until the journal entry commits.
4817 map<string,bufferptr> *px = new map<string,bufferptr>;
4818 inode_t *pi = cur->project_inode(px);
4819 pi->version = cur->pre_dirty();
4820 pi->ctime = mdr->get_op_stamp();
4821 pi->change_attr++;
4822 pi->xattr_version++;
4823 px->erase(name);
4824
4825 // log + wait
4826 mdr->ls = mdlog->get_current_segment();
4827 EUpdate *le = new EUpdate(mdlog, "removexattr");
4828 mdlog->start_entry(le);
4829 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4830 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4831 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4832
4833 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4834 }
4835
4836
4837 // =================================================================
4838 // DIRECTORY and NAMESPACE OPS
4839
4840
4841 // ------------------------------------------------
4842
4843 // MKNOD
4844
/*
 * Journal-commit finisher used by mknod, mkdir and symlink (below): once
 * the EUpdate has been journaled, link the new inode into its dentry,
 * mark the new metadata dirty, and reply to the client.
 */
class C_MDS_mknod_finish : public ServerLogContext {
  CDentry *dn;    // the (previously null) dentry for the new name
  CInode *newi;   // the freshly created inode
public:
  C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    assert(r == 0);

    // link the inode
    dn->pop_projected_linkage();

    // be a bit hacky with the inode version, here.. we decrement it
    // just to keep mark_dirty() happy. (we didn't bother projecting
    // a new version of the inode since it's just been created)
    newi->inode.version--;
    newi->mark_dirty(newi->inode.version + 1, mdr->ls);
    newi->_mark_dirty_parent(mdr->ls, true);

    // mkdir? also dirty the new dir's (empty) root dirfrag, using the
    // same version-decrement trick
    if (newi->inode.is_dir()) {
      CDir *dir = newi->get_dirfrag(frag_t());
      assert(dir);
      dir->fnode.version--;
      dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
      dir->mark_new(mdr->ls);
    }

    mdr->apply();

    // tell replicas about the new link
    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    if (newi->inode.is_file())
      get_mds()->locker->share_inode_max_size(newi);

    // hit pop
    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);

    // reply
    server->respond_to_request(mdr, 0);
  }
};
4888
4889
/*
 * Handle a client mknod request: create a new inode at the request path.
 * If no file-type bits are supplied, a regular file is created; for
 * regular files a client_range and RDWR caps are issued immediately,
 * since the client is likely to write next (e.g. NFS reexport).
 */
void Server::handle_client_mknod(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  // xlock a null dentry at the target path (the name must not exist)
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
					 &dir_layout);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    respond_to_request(mdr, -EROFS);
    return;
  }
  CInode *diri = dn->get_dir()->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dn->get_dir()))
    return;

  unsigned mode = req->head.args.mknod.mode;
  if ((mode & S_IFMT) == 0)
    mode |= S_IFREG;  // default to a regular file when no type bits given

  // set layout: regular files inherit the directory layout when present
  file_layout_t layout;
  if (dir_layout && S_ISREG(mode))
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				   mode, &layout);
  assert(newi);

  dn->push_projected_linkage(newi);

  newi->inode.rdev = req->head.args.mknod.rdev;
  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rfiles = 1;
  // record the default pool as an old pool if the layout points elsewhere
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  newi->inode.update_backtrace();

  // if the client created a _regular_ file via MKNOD, it's highly likely they'll
  // want to write to it (e.g., if they are reexporting NFS)
  if (S_ISREG(newi->inode.mode)) {
    dout(15) << " setting a client_range too, since this is a regular file" << dendl;
    newi->inode.client_ranges[client].range.first = 0;
    newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
    newi->inode.client_ranges[client].follows = follows;

    // issue a cap on the file
    int cmode = CEPH_FILE_MODE_RDWR;
    Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
    if (cap) {
      cap->set_wanted(0);

      // put locks in excl mode so the creator keeps exclusive caps
      newi->filelock.set_state(LOCK_EXCL);
      newi->authlock.set_state(LOCK_EXCL);
      newi->xattrlock.set_state(LOCK_EXCL);
    }
  }

  assert(dn->first == follows + 1);
  newi->first = dn->first;

  dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;

  // prepare finisher: journal the creation; C_MDS_mknod_finish links
  // the inode and replies once the entry is safe
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mknod");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);

  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
				    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
4979
4980
4981
4982 // MKDIR
4983 /* This function takes responsibility for the passed mdr*/
// MKDIR
/*
 * Handle a client mkdir request: create a new directory inode (plus its
 * empty root dirfrag) at the request path, issue caps on it, and journal
 * the creation.  This function takes responsibility for the passed mdr.
 */
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  // xlock a null dentry at the target path (the name must not exist)
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    respond_to_request(mdr, -EROFS);
    return;
  }
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // mkdir check access
  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  // new inode
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();

  // force the type bits to "directory" regardless of what was requested
  unsigned mode = req->head.args.mkdir.mode;
  mode &= ~S_IFMT;
  mode |= S_IFDIR;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a directory.
  dn->push_projected_linkage(newi);

  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rsubdirs = 1;
  newi->inode.update_backtrace();

  dout(12) << " follows " << follows << dendl;
  assert(dn->first == follows + 1);
  newi->first = dn->first;

  // ...and that new dir is empty.
  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
  newdir->state_set(CDir::STATE_CREATING);
  newdir->mark_complete();
  newdir->fnode.version = newdir->pre_dirty();

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mkdir");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new

  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
  if (cap) {
    cap->set_wanted(0);

    // put locks in excl mode so the creator keeps exclusive caps
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
  }

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&newi->item_open_file);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5063
5064
5065 // SYMLINK
5066
/*
 * Handle a client symlink request: create a new symlink inode at the
 * request path whose target is the string in path2.  Journals the
 * creation and replies via C_MDS_mknod_finish.
 */
void Server::handle_client_symlink(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  // xlock a null dentry at the target path (the name must not exist)
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    respond_to_request(mdr, -EROFS);
    return;
  }
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  // symlinks are always mode 0777
  unsigned mode = S_IFLNK | 0777;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a symlink
  dn->push_projected_linkage(newi);

  // the link target is the request's second path; size/rbytes track its length
  newi->symlink = req->get_path2();
  newi->inode.size = newi->symlink.length();
  newi->inode.rstat.rbytes = newi->inode.size;
  newi->inode.rstat.rfiles = 1;
  newi->inode.version = dn->pre_dirty();
  newi->inode.update_backtrace();

  newi->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "symlink");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5116
5117
5118
5119
5120
5121 // LINK
5122
/*
 * Handle a client hard-link request: create a new (remote) dentry at
 * filepath pointing at the inode named by filepath2.  Linking a
 * directory is rejected with EINVAL.  Dispatches to _link_local() when
 * this MDS is auth for the target inode, or _link_remote() when the
 * nlink update must be prepared by the target's auth MDS.
 */
void Server::handle_client_link(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  dout(7) << "handle_client_link " << req->get_filepath()
	  << " to " << req->get_filepath2()
	  << dendl;

  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  // xlock a null dentry for the new name; pin the link target
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
  if (!targeti) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    respond_to_request(mdr, -EROFS);
    return;
  }

  CDir *dir = dn->get_dir();
  dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
  dout(7) << "target is " << *targeti << dendl;
  if (targeti->is_dir()) {
    // hard links to directories are not allowed
    dout(7) << "target is a dir, failing..." << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  xlocks.insert(&targeti->linklock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // only run the access/space checks before any slave has witnessed the
  // operation; after that we are committed to going forward
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, targeti, MAY_WRITE))
      return;

    if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
      return;

    if (!check_fragment_space(mdr, dir))
      return;
  }

  // go!
  assert(g_conf->mds_kill_link_at != 1);

  // local or remote?
  if (targeti->is_auth())
    _link_local(mdr, dn, targeti);
  else
    _link_remote(mdr, true, dn, targeti);
}
5176
5177
5178 class C_MDS_link_local_finish : public ServerLogContext {
5179 CDentry *dn;
5180 CInode *targeti;
5181 version_t dnpv;
5182 version_t tipv;
5183 public:
5184 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5185 version_t dnpv_, version_t tipv_) :
5186 ServerLogContext(s, r), dn(d), targeti(ti),
5187 dnpv(dnpv_), tipv(tipv_) { }
5188 void finish(int r) override {
5189 assert(r == 0);
5190 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5191 }
5192 };
5193
5194
/*
 * Create a hard link where this MDS is auth for the target inode:
 * predirty the new dentry and the target, project the nlink++ on the
 * target, journal an EUpdate, and finish in C_MDS_link_local_finish.
 */
void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;

  mdr->ls = mdlog->get_current_segment();

  // predirty NEW dentry
  version_t dnpv = dn->pre_dirty();
  version_t tipv = targeti->pre_dirty();

  // project inode update: one more link, new ctime
  inode_t *pi = targeti->project_inode();
  pi->nlink++;
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->version = tipv;

  // log + wait
  EUpdate *le = new EUpdate(mdlog, "link_local");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
  le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);

  // do this after predirty_*, to avoid funky extra dnl arg
  dn->push_projected_linkage(targeti->ino(), targeti->d_type());

  journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
}
5226
/*
 * Post-journal half of _link_local(): apply the projected updates, tell
 * replicas about the new dentry, bump popularity, and reply.
 */
void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
				version_t dnpv, version_t tipv)
{
  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;

  // link and unlock the NEW dentry
  dn->pop_projected_linkage();
  dn->mark_dirty(dnpv, mdr->ls);

  // target inode
  targeti->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();

  MDRequestRef null_ref;
  mdcache->send_dentry_link(dn, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);
}
5251
5252
5253 // link / unlink remote
5254
5255 class C_MDS_link_remote_finish : public ServerLogContext {
5256 bool inc;
5257 CDentry *dn;
5258 CInode *targeti;
5259 version_t dpv;
5260 public:
5261 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5262 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5263 dpv(d->get_projected_version()) {}
5264 void finish(int r) override {
5265 assert(r == 0);
5266 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5267 }
5268 };
5269
/*
 * Link (inc=true) or unlink (inc=false) a dentry whose target inode is
 * authoritative on another MDS.  First asks the target's auth MDS to
 * prepare the nlink change (OP_LINKPREP/OP_UNLINKPREP slave request) and
 * returns while waiting for the ack; when re-dispatched with the witness
 * recorded, journals the local EUpdate and finishes in
 * C_MDS_link_remote_finish.
 */
void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_remote "
	   << (inc ? "link ":"unlink ")
	   << *dn << " to " << *targeti << dendl;

  // 1. send LinkPrepare to dest (journal nlink++ prepare)
  mds_rank_t linkauth = targeti->authority().first;
  if (mdr->more()->witnessed.count(linkauth) == 0) {
    if (mds->is_cluster_degraded() &&
	!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
      // wait for the target's auth to become active before preparing
      dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
      if (mdr->more()->waiting_on_slave.empty())
	mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }

    dout(10) << " targeti auth must prepare nlink++/--" << dendl;
    int op;
    if (inc)
      op = MMDSSlaveRequest::OP_LINKPREP;
    else
      op = MMDSSlaveRequest::OP_UNLINKPREP;
    MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
    targeti->set_object_info(req->get_object_info());
    req->op_stamp = mdr->get_op_stamp();
    mds->send_message_mds(req, linkauth);

    // wait for the slave's LINKPREPACK (handle_slave_link_prep_ack)
    assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
    mdr->more()->waiting_on_slave.insert(linkauth);
    return;
  }
  dout(10) << " targeti auth has prepared nlink++/--" << dendl;

  assert(g_conf->mds_kill_link_at != 2);

  mdr->set_mds_stamp(ceph_clock_now());

  // add to event
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    // record that slaves participated so resolve can commit/abort them
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_slaves = true;
    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
  }

  if (inc) {
    // add the new remote dentry
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
    le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
    dn->push_projected_linkage(targeti->ino(), targeti->d_type());
  } else {
    // null out the existing dentry
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
    le->metablob.add_null_dentry(dn, true);
  }

  journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
}
5334
/*
 * Post-journal half of _link_remote(): apply the dentry change (link or
 * null-out), notify replicas, bump popularity, and reply.
 */
void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
				 CDentry *dn, CInode *targeti,
				 version_t dpv)
{
  dout(10) << "_link_remote_finish "
	   << (inc ? "link ":"unlink ")
	   << *dn << " to " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 3);

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  if (inc) {
    // link the new dentry
    dn->pop_projected_linkage();
    dn->mark_dirty(dpv, mdr->ls);
  } else {
    // unlink main dentry
    dn->get_dir()->unlink_inode(dn);
    dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
  }

  mdr->apply();

  MDRequestRef null_ref;
  if (inc)
    mdcache->send_dentry_link(dn, null_ref);
  else
    mdcache->send_dentry_unlink(dn, NULL, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  if (!inc)
    // removing a new dn?
    dn->get_dir()->try_remove_unlinked_dn(dn);
}
5377
5378
5379 // remote linking/unlinking
5380
5381 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5382 CInode *targeti;
5383 public:
5384 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5385 ServerLogContext(s, r), targeti(t) { }
5386 void finish(int r) override {
5387 assert(r == 0);
5388 server->_logged_slave_link(mdr, targeti);
5389 }
5390 };
5391
5392 class C_MDS_SlaveLinkCommit : public ServerContext {
5393 MDRequestRef mdr;
5394 CInode *targeti;
5395 public:
5396 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5397 ServerContext(s), mdr(r), targeti(t) { }
5398 void finish(int r) override {
5399 server->_commit_slave_link(mdr, r, targeti);
5400 }
5401 };
5402
/* This function DOES put the mdr->slave_request before returning*/
/*
 * Slave side of a remote link/unlink: we are auth for the target inode.
 * Project the nlink change, encode rollback state (so the update can be
 * undone if the master aborts), journal a prepare ESlaveUpdate, and ack
 * with OP_LINKPREPACK from _logged_slave_link().
 */
void Server::handle_slave_link_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_link_prep " << *mdr
	   << " on " << mdr->slave_request->get_object_info()
	   << dendl;

  assert(g_conf->mds_kill_link_at != 4);

  CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
  assert(targeti);
  dout(10) << "targeti " << *targeti << dendl;
  CDentry *dn = targeti->get_parent_dn();
  CDentry::linkage_t *dnl = dn->get_linkage();
  assert(dnl->is_primary());

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  mdr->auth_pin(targeti);

  //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
  assert(g_conf->mds_kill_link_at != 5);

  // journal it
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
				      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
  mdlog->start_entry(le);

  inode_t *pi = dnl->get_inode()->project_inode();

  // update journaled target inode: nlink++ for LINKPREP, nlink-- otherwise
  bool inc;
  if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
    inc = true;
    pi->nlink++;
  } else {
    inc = false;
    pi->nlink--;
  }

  // capture everything needed to undo this prepare if the master aborts
  link_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.ino = targeti->ino();
  rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
  const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
  rollback.old_dir_mtime = pf->fragstat.mtime;
  rollback.old_dir_rctime = pf->rstat.rctime;
  rollback.was_inc = inc;
  ::encode(rollback, le->rollback);
  mdr->more()->rollback_bl = le->rollback;

  pi->ctime = mdr->get_op_stamp();
  pi->version = targeti->pre_dirty();

  dout(10) << " projected inode " << pi << " v " << pi->version << dendl;

  // commit case
  mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
		     mdr, __func__);
  mdlog->flush();
}
5472
/*
 * Called once the slave's prepare ESlaveUpdate is journaled: apply the
 * projected nlink change and ack the master with OP_LINKPREPACK (unless
 * the request was aborted in the meantime).
 */
void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
{
  dout(10) << "_logged_slave_link " << *mdr
	   << " " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 6);

  // update the target
  targeti->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  // hit pop
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;

  // ack
  if (!mdr->aborted) {
    MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
						   MMDSSlaveRequest::OP_LINKPREPACK);
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
5501
5502
// Completion: the slave's commit ESlaveUpdate hit the journal; notify
// the master via _committed_slave().
struct C_MDS_CommittedSlave : public ServerLogContext {
  C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
  void finish(int r) override {
    server->_committed_slave(mdr);
  }
};
5509
/*
 * Master's decision arrived for a prepared slave link: on r == 0 journal
 * a commit ESlaveUpdate (then ack via C_MDS_CommittedSlave); otherwise
 * roll the prepared nlink change back.
 */
void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
{
  dout(10) << "_commit_slave_link " << *mdr
	   << " r=" << r
	   << " " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 7);

  if (r == 0) {
    // drop our pins, etc.
    mdr->cleanup();

    // write a commit to the journal
    ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
					ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
    mdlog->start_entry(le);
    submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
    mdlog->flush();
  } else {
    // master aborted: undo the prepared update
    do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}
5532
/*
 * The slave's commit record is durable: tell the master (OP_COMMITTED)
 * so it can retire its uncommitted-master state, then finish the request.
 */
void Server::_committed_slave(MDRequestRef& mdr)
{
  dout(10) << "_committed_slave " << *mdr << dendl;

  assert(g_conf->mds_kill_link_at != 8);

  MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
					       MMDSSlaveRequest::OP_COMMITTED);
  mds->send_message_mds(req, mdr->slave_to_mds);
  mdcache->request_finish(mdr);
}
5544
5545 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5546 MutationRef mut;
5547 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5548 void finish(int r) override {
5549 server->_link_rollback_finish(mut, mdr);
5550 }
5551 };
5552
/*
 * Undo a prepared slave link/unlink using the link_rollback state that
 * handle_slave_link_prep() encoded: restore the target inode's nlink and
 * ctime and the parent dir's mtime/rctime, then journal the rollback as
 * an ESlaveUpdate.  Called either for a live request (mdr set) or during
 * resolve after a restart (mdr null).
 */
void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
  link_rollback rollback;
  bufferlist::iterator p = rbl.begin();
  ::decode(rollback, p);

  dout(10) << "do_link_rollback on " << rollback.reqid
	   << (rollback.was_inc ? " inc":" dec")
	   << " ino " << rollback.ino
	   << dendl;

  assert(g_conf->mds_kill_link_at != 9);

  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
  assert(mdr || mds->is_resolve());

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CInode *in = mdcache->get_inode(rollback.ino);
  assert(in);
  dout(10) << " target is " << *in << dendl;
  assert(!in->is_projected()); // live slave request hold versionlock xlock.

  inode_t *pi = in->project_inode();
  pi->version = in->pre_dirty();
  mut->add_projected_inode(in);

  // parent dir rctime: only restore if our prepare was the last change
  CDir *parent = in->get_projected_parent_dn()->get_dir();
  fnode_t *pf = parent->project_fnode();
  mut->add_projected_fnode(parent);
  pf->version = parent->pre_dirty();
  if (pf->fragstat.mtime == pi->ctime) {
    pf->fragstat.mtime = rollback.old_dir_mtime;
    if (pf->rstat.rctime == pi->ctime)
      pf->rstat.rctime = rollback.old_dir_rctime;
    mut->add_updated_lock(&parent->get_inode()->filelock);
    mut->add_updated_lock(&parent->get_inode()->nestlock);
  }

  // inode: restore ctime and reverse the nlink change
  pi->ctime = rollback.old_ctime;
  if (rollback.was_inc)
    pi->nlink--;
  else
    pi->nlink++;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
  mdlog->start_entry(le);
  le->commit.add_dir_context(parent);
  le->commit.add_dir(parent, true);
  le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);

  submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
		     mdr, __func__);
  mdlog->flush();
}
5613
/*
 * Post-journal half of do_link_rollback(): apply the rollback mutation,
 * finish the live request if there was one, and retire the rollback
 * record so resolve can complete.
 */
void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
{
  dout(10) << "_link_rollback_finish" << dendl;

  assert(g_conf->mds_kill_link_at != 10);

  mut->apply();
  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(mut->reqid);

  mut->cleanup();
}
5628
5629
/* This function DOES NOT put the passed message before returning*/
/*
 * Master receives OP_LINKPREPACK from the target's auth MDS: record it
 * as a witness, clear it from the waiting set, and re-dispatch the
 * client request (which will now take the journaling path in
 * _link_remote()).
 */
void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
{
  dout(10) << "handle_slave_link_prep_ack " << *mdr
	   << " " << *m << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  assert(g_conf->mds_kill_link_at != 11);

  // note slave
  mdr->more()->slaves.insert(from);

  // witnessed!
  assert(mdr->more()->witnessed.count(from) == 0);
  mdr->more()->witnessed.insert(from);
  assert(!m->is_not_journaled());
  mdr->more()->has_journaled_slaves = true;

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // link prep only ever involves the single target-auth slave
  assert(mdr->more()->waiting_on_slave.empty());

  dispatch_client_request(mdr); // go again!
}
5656
5657
5658
5659
5660
5661 // UNLINK
5662
/*
 * Handle a client unlink or rmdir request.  Validates the target
 * (rmdir requires an empty directory; unlink rejects directories),
 * prepares a stray dentry for primary links, takes the full lock set,
 * gathers witnesses for subtree-root directories, then dispatches to
 * _link_remote() (remote dentry, non-auth inode) or _unlink_local().
 */
void Server::handle_client_unlink(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();

  // rmdir or unlink?
  bool rmdir = false;
  if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;

  if (req->get_filepath().depth() == 0) {
    // cannot unlink the root of the path
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // traverse to path
  vector<CDentry*> trace;
  CInode *in;
  int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
  if (r > 0) return;  // traversal continuing asynchronously
  if (r < 0) {
    if (r == -ESTALE) {
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
      return;
    }
    respond_to_request(mdr, r);
    return;
  }
  if (mdr->snapid != CEPH_NOSNAP) {
    // snapshots are read-only
    respond_to_request(mdr, -EROFS);
    return;
  }

  CDentry *dn = trace[trace.size()-1];
  assert(dn);
  if (!dn->is_auth()) {
    // the dentry's auth MDS must perform the unlink
    mdcache->request_forward(mdr, dn->authority().first);
    return;
  }

  CInode *diri = dn->get_dir()->get_inode();

  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  assert(!dnl->is_null());

  if (rmdir) {
    dout(7) << "handle_client_rmdir on " << *dn << dendl;
  } else {
    dout(7) << "handle_client_unlink on " << *dn << dendl;
  }
  dout(7) << "dn links to " << *in << dendl;

  // rmdir vs is_dir
  if (in->is_dir()) {
    if (rmdir) {
      // do empty directory checks (cheap, unlocked pre-check; re-checked
      // under locks below)
      if (_dir_is_nonempty_unlocked(mdr, in)) {
	respond_to_request(mdr, -ENOTEMPTY);
	return;
      }
    } else {
      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -EISDIR);
      return;
    }
  } else {
    if (rmdir) {
      // unlink
      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -ENOTDIR);
      return;
    }
  }

  // -- create stray dentry? --
  // a primary link's inode must be moved to a stray dir until its last
  // reference goes away
  CDentry *straydn = NULL;
  if (dnl->is_primary()) {
    straydn = prepare_stray_dentry(mdr, dnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    // leftover stray from an earlier attempt is no longer needed
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }

  // lock
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  for (int i=0; i<(int)trace.size()-1; i++)
    rdlocks.insert(&trace[i]->lock);
  xlocks.insert(&dn->lock);
  wrlocks.insert(&diri->filelock);
  wrlocks.insert(&diri->nestlock);
  xlocks.insert(&in->linklock);
  if (straydn) {
    wrlocks.insert(&straydn->get_dir()->inode->filelock);
    wrlocks.insert(&straydn->get_dir()->inode->nestlock);
    xlocks.insert(&straydn->lock);
  }
  if (in->is_dir())
    rdlocks.insert(&in->filelock); // to verify it's empty
  mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // re-check emptiness now that we hold the locks
  if (in->is_dir() &&
      _dir_is_nonempty(mdr, in)) {
    respond_to_request(mdr, -ENOTEMPTY);
    return;
  }

  // only check access before any slave has witnessed the operation
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, diri, MAY_WRITE))
      return;
  }

  // yay!
  if (in->is_dir() && in->has_subtree_root_dirfrag()) {
    // subtree root auths need to be witnesses
    set<mds_rank_t> witnesses;
    in->list_replicas(witnesses);
    dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

    for (set<mds_rank_t>::iterator p = witnesses.begin();
	 p != witnesses.end();
	 ++p) {
      if (mdr->more()->witnessed.count(*p)) {
	dout(10) << " already witnessed by mds." << *p << dendl;
      } else if (mdr->more()->waiting_on_slave.count(*p)) {
	dout(10) << " already waiting on witness mds." << *p << dendl;
      } else {
	if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
	  return;
      }
    }
    if (!mdr->more()->waiting_on_slave.empty())
      return; // we're waiting for a witness.
  }

  // ok!
  if (dnl->is_remote() && !dnl->get_inode()->is_auth())
    _link_remote(mdr, false, dn, dnl->get_inode());
  else
    _unlink_local(mdr, dn, straydn);
}
5810
5811 class C_MDS_unlink_local_finish : public ServerLogContext {
5812 CDentry *dn;
5813 CDentry *straydn;
5814 version_t dnpv; // deleted dentry
5815 public:
5816 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
5817 ServerLogContext(s, r), dn(d), straydn(sd),
5818 dnpv(d->get_projected_version()) {}
5819 void finish(int r) override {
5820 assert(r == 0);
5821 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
5822 }
5823 };
5824
// _unlink_local: journal an unlink/rmdir performed entirely on this mds.
// Projects the removed dentry to null (and, for a primary link, projects
// the inode into the stray directory via straydn), predirties parents,
// and submits an EUpdate.  The cache is only mutated once the event is
// durable (see C_MDS_unlink_local_finish / _unlink_local_finish).
// NOTE(review): straydn is non-null exactly when the link is primary —
// asserted below.
5825 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
5826 {
5827 dout(10) << "_unlink_local " << *dn << dendl;
5828
5829 CDentry::linkage_t *dnl = dn->get_projected_linkage();
5830 CInode *in = dnl->get_inode();
5831
// the stray dentry must begin after the newest snapshot in the realm
5832 SnapRealm *realm = in->find_snaprealm();
5833 snapid_t follows = realm->get_newest_seq();
5834
5835 // ok, let's do it.
5836 mdr->ls = mdlog->get_current_segment();
5837
5838 // prepare log entry
5839 EUpdate *le = new EUpdate(mdlog, "unlink_local");
5840 mdlog->start_entry(le);
5841 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
// multi-mds op: record the witnesses so resolve can commit/abort them later
5842 if (!mdr->more()->witnessed.empty()) {
5843 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5844 le->reqid = mdr->reqid;
5845 le->had_slaves = true;
5846 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5847 }
5848
5849 if (straydn) {
5850 assert(dnl->is_primary());
5851 straydn->push_projected_linkage(in);
5852 straydn->first = follows + 1;
5853 }
5854
5855 // the unlinked dentry
5856 dn->pre_dirty();
5857
// project the inode update: bump version/ctime/change_attr and drop nlink
5858 inode_t *pi = in->project_inode();
5859 dn->make_path_string(pi->stray_prior_path);
5860 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
5861 pi->version = in->pre_dirty();
5862 pi->ctime = mdr->get_op_stamp();
5863 pi->change_attr++;
5864 pi->nlink--;
5865 if (pi->nlink == 0)
5866 in->state_set(CInode::STATE_ORPHAN);
5867
5868 if (dnl->is_primary()) {
5869 // primary link. add stray dentry.
5870 assert(straydn);
5871 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
5872 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5873
5874 // project snaprealm, too
5875 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
5876 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
5877
5878 pi->update_backtrace();
5879 le->metablob.add_primary_dentry(straydn, in, true, true);
5880 } else {
5881 // remote link. update remote inode.
5882 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
5883 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5884 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5885 }
5886
// journal the unlink itself (null dentry), CoWing the dentry for snaps first
5887 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5888 le->metablob.add_null_dentry(dn, true);
5889
5890 if (in->is_dir()) {
5891 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
5892 le->metablob.renamed_dirino = in->ino();
5893 }
5894
// queue the projected (null) linkage on the removed dentry
5895 dn->push_projected_linkage();
5896
5897 if (in->is_dir()) {
5898 assert(straydn);
5899 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
5900 }
5901
5902 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
5903 }
5904
// Apply a journaled unlink: pop the projected linkages, mark the dentry
// dirty, notify replicas and clients, then reply.  Invoked by
// C_MDS_unlink_local_finish once the EUpdate is safe.
5905 void Server::_unlink_local_finish(MDRequestRef& mdr,
5906 CDentry *dn, CDentry *straydn,
5907 version_t dnpv)
5908 {
5909 dout(10) << "_unlink_local_finish " << *dn << dendl;
5910
// master of a multi-mds op: note that the update is now logged
5911 if (!mdr->more()->witnessed.empty())
5912 mdcache->logged_master_update(mdr->reqid);
5913
5914 // unlink main dentry
5915 dn->get_dir()->unlink_inode(dn);
5916 dn->pop_projected_linkage();
5917
5918 // relink as stray? (i.e. was primary link?)
5919 CInode *strayin = NULL;
5920 bool snap_is_new = false;
5921 if (straydn) {
5922 dout(20) << " straydn is " << *straydn << dendl;
5923 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
5924 strayin = straydnl->get_inode();
5925
5926 snap_is_new = strayin->snaprealm ? true : false;
5927 mdcache->touch_dentry_bottom(straydn);
5928 }
5929
5930 dn->mark_dirty(dnpv, mdr->ls);
5931 mdr->apply();
5932
5933 if (snap_is_new) //only new if strayin exists
5934 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
5935
// tell replicas (and thereby clients) about the unlink
5936 mdcache->send_dentry_unlink(dn, straydn, mdr);
5937
5938 // update subtree map?
5939 if (straydn && strayin->is_dir())
5940 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
5941
5942 // bump pop
5943 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
5944
5945 // reply
5946 respond_to_request(mdr, 0);
5947
5948 // removing a new dn?
5949 dn->get_dir()->try_remove_unlinked_dn(dn);
5950
5951 // clean up ?
5952 // respond_to_request() drops locks. So stray reintegration can race with us.
5953 if (straydn && !straydn->get_projected_linkage()->is_null()) {
5954 // Tip off the MDCache that this dentry is a stray that
5955 // might be eligible for purge.
5956 mdcache->notify_stray(straydn);
5957 }
5958 }
5959
5960 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
5961 {
5962 if (mds->is_cluster_degraded() &&
5963 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
5964 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
5965 if (mdr->more()->waiting_on_slave.empty())
5966 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
5967 return false;
5968 }
5969
5970 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
5971 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5972 MMDSSlaveRequest::OP_RMDIRPREP);
5973 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
5974 for (auto dn : trace)
5975 req->srcdnpath.push_dentry(dn->name);
5976 mdcache->replicate_stray(straydn, who, req->stray);
5977
5978 req->op_stamp = mdr->get_op_stamp();
5979 mds->send_message_mds(req, who);
5980
5981 assert(mdr->more()->waiting_on_slave.count(who) == 0);
5982 mdr->more()->waiting_on_slave.insert(who);
5983 return true;
5984 }
5985
5986 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
5987 CDentry *dn, *straydn;
5988 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
5989 : ServerLogContext(s, r), dn(d), straydn(st) {}
5990 void finish(int r) override {
5991 server->_logged_slave_rmdir(mdr, dn, straydn);
5992 }
5993 };
5994
5995 struct C_MDS_SlaveRmdirCommit : public ServerContext {
5996 MDRequestRef mdr;
5997 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r)
5998 : ServerContext(s), mdr(r) { }
5999 void finish(int r) override {
6000 server->_commit_slave_rmdir(mdr, r);
6001 }
6002 };
6003
// Slave side of a distributed rmdir: the master's OP_RMDIRPREP asks us
// to relink the doomed dir inode under the master's stray dentry.  We
// first encode rollback state; then, if none of the inode's subtrees are
// auth here, apply the change without journaling and ack immediately,
// otherwise journal an ESlaveUpdate(PREPARE) and ack from
// _logged_slave_rmdir() once it is durable.
6004 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6005 {
6006 dout(10) << "handle_slave_rmdir_prep " << *mdr
6007 << " " << mdr->slave_request->srcdnpath
6008 << " to " << mdr->slave_request->destdnpath
6009 << dendl;
6010
6011 vector<CDentry*> trace;
6012 filepath srcpath(mdr->slave_request->srcdnpath);
6013 dout(10) << " src " << srcpath << dendl;
6014 CInode *in;
6015 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
// r > 0: traverse is waiting on discovery and will retry us
6016 if (r > 0) return;
6017 if (r == -ESTALE) {
6018 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6019 mdr->slave_to_mds);
6020 return;
6021 }
6022 assert(r == 0);
6023 CDentry *dn = trace[trace.size()-1];
6024 dout(10) << " dn " << *dn << dendl;
6025 mdr->pin(dn);
6026
// straydn was decoded from the master's request when it was received
6027 assert(mdr->straydn);
6028 CDentry *straydn = mdr->straydn;
6029 dout(10) << " straydn " << *straydn << dendl;
6030
6031 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6032
// record enough state to undo the relink if the master later aborts
6033 rmdir_rollback rollback;
6034 rollback.reqid = mdr->reqid;
6035 rollback.src_dir = dn->get_dir()->dirfrag();
6036 rollback.src_dname = dn->name;
6037 rollback.dest_dir = straydn->get_dir()->dirfrag();
6038 rollback.dest_dname = straydn->name;
6039 ::encode(rollback, mdr->more()->rollback_bl);
6040 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6041
6042 // set up commit waiter
6043 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr);
6044
// fast path: we are not auth for any subtree under the inode, so the
// relink can be applied directly and acked as not-journaled
6045 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6046 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6047 dn->get_dir()->unlink_inode(dn);
6048 straydn->get_dir()->link_primary_inode(straydn, in);
6049
6050 assert(straydn->first >= in->first);
6051 in->first = straydn->first;
6052
6053 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
6054
6055 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6056 MMDSSlaveRequest::OP_RMDIRPREPACK);
6057 reply->mark_not_journaled();
6058 mds->send_message_mds(reply, mdr->slave_to_mds);
6059
6060 // send caps to auth (if we're not already)
6061 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
6062 mdcache->migrator->export_caps(in);
6063
6064 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
6065
6066 mdr->slave_request->put();
6067 mdr->slave_request = 0;
6068 mdr->straydn = 0;
6069 return;
6070 }
6071
// journaled path: project the relink, then log the PREPARE event
6072 straydn->push_projected_linkage(in);
6073 dn->push_projected_linkage();
6074
6075 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6076 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
6077 mdlog->start_entry(le);
6078 le->rollback = mdr->more()->rollback_bl;
6079
6080 le->commit.add_dir_context(straydn->get_dir());
6081 le->commit.add_primary_dentry(straydn, in, true);
6082 // slave: no need to journal original dentry
6083
6084 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6085 le->commit.renamed_dirino = in->ino();
6086
6087 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6088
6089 mdr->more()->slave_update_journaled = true;
6090 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
6091 mdr, __func__);
6092 mdlog->flush();
6093 }
6094
// Journal completion for the slave rmdir PREPARE: apply the projected
// relink into the stray dir and ack the master with OP_RMDIRPREPACK —
// unless the request was aborted while we were journaling, in which case
// just finish it.
6095 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6096 {
6097 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
6098
6099 // update our cache now, so we are consistent with what is in the journal
6100 // when we journal a subtree map
6101 CInode *in = dn->get_linkage()->get_inode();
6102 dn->get_dir()->unlink_inode(dn);
6103 straydn->pop_projected_linkage();
6104 dn->pop_projected_linkage();
6105 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
6106
6107 // done.
6108 mdr->slave_request->put();
6109 mdr->slave_request = 0;
6110 mdr->straydn = 0;
6111
6112 if (!mdr->aborted) {
6113 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6114 MMDSSlaveRequest::OP_RMDIRPREPACK);
6115 mds->send_message_mds(reply, mdr->slave_to_mds);
6116 } else {
6117 dout(10) << " abort flag set, finishing" << dendl;
6118 mdcache->request_finish(mdr);
6119 }
6120 }
6121
6122 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
6123 {
6124 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6125 << " " << *ack << dendl;
6126
6127 mds_rank_t from = mds_rank_t(ack->get_source().num());
6128
6129 mdr->more()->slaves.insert(from);
6130 mdr->more()->witnessed.insert(from);
6131 if (!ack->is_not_journaled())
6132 mdr->more()->has_journaled_slaves = true;
6133
6134 // remove from waiting list
6135 assert(mdr->more()->waiting_on_slave.count(from));
6136 mdr->more()->waiting_on_slave.erase(from);
6137
6138 if (mdr->more()->waiting_on_slave.empty())
6139 dispatch_client_request(mdr); // go again!
6140 else
6141 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
6142 }
6143
6144 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r)
6145 {
6146 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
6147
6148 if (r == 0) {
6149 mdr->cleanup();
6150
6151 if (mdr->more()->slave_update_journaled) {
6152 // write a commit to the journal
6153 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
6154 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
6155 ESlaveUpdate::RMDIR);
6156 mdlog->start_entry(le);
6157 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6158 mdlog->flush();
6159 } else {
6160 _committed_slave(mdr);
6161 }
6162 } else {
6163 // abort
6164 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6165 }
6166 }
6167
6168 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6169 metareqid_t reqid;
6170 CDentry *dn;
6171 CDentry *straydn;
6172 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6173 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6174 void finish(int r) override {
6175 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6176 }
6177 };
6178
6179 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6180 {
6181 // unlink the other rollback methods, the rmdir rollback is only
6182 // needed to record the subtree changes in the journal for inode
6183 // replicas who are auth for empty dirfrags. no actual changes to
6184 // the file system are taking place here, so there is no Mutation.
6185
6186 rmdir_rollback rollback;
6187 bufferlist::iterator p = rbl.begin();
6188 ::decode(rollback, p);
6189
6190 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6191 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6192 assert(mdr || mds->is_resolve());
6193
6194 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6195 if (!dir)
6196 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6197 assert(dir);
6198 CDentry *dn = dir->lookup(rollback.src_dname);
6199 assert(dn);
6200 dout(10) << " dn " << *dn << dendl;
6201 dir = mdcache->get_dirfrag(rollback.dest_dir);
6202 assert(dir);
6203 CDentry *straydn = dir->lookup(rollback.dest_dname);
6204 assert(straydn);
6205 dout(10) << " straydn " << *dn << dendl;
6206 CInode *in = straydn->get_linkage()->get_inode();
6207
6208 if (mdr && !mdr->more()->slave_update_journaled) {
6209 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6210
6211 straydn->get_dir()->unlink_inode(straydn);
6212 dn->get_dir()->link_primary_inode(dn, in);
6213
6214 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6215
6216 mdcache->request_finish(mdr);
6217 mdcache->finish_rollback(rollback.reqid);
6218 return;
6219 }
6220
6221 dn->push_projected_linkage(in);
6222 straydn->push_projected_linkage();
6223
6224 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6225 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6226 mdlog->start_entry(le);
6227
6228 le->commit.add_dir_context(dn->get_dir());
6229 le->commit.add_primary_dentry(dn, in, true);
6230 // slave: no need to journal straydn
6231
6232 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6233 le->commit.renamed_dirino = in->ino();
6234
6235 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6236
6237 submit_mdlog_entry(le,
6238 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6239 dn, straydn),
6240 mdr, __func__);
6241 mdlog->flush();
6242 }
6243
// Journal completion for the rmdir ROLLBACK: pop the projected linkages
// (re-linking the inode under its original dentry), fix up the subtree
// map, and mark the rollback finished.
6244 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
6245 {
6246 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
6247
6248 straydn->get_dir()->unlink_inode(straydn);
6249 dn->pop_projected_linkage();
6250 straydn->pop_projected_linkage();
6251
6252 CInode *in = dn->get_linkage()->get_inode();
6253 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
// during resolve we may now be able to trim the whole non-auth subtree
6254 if (mds->is_resolve()) {
6255 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
6256 mdcache->try_trim_non_auth_subtree(root);
6257 }
6258
// mdr is null when the rollback was driven by resolve
6259 if (mdr)
6260 mdcache->request_finish(mdr);
6261
6262 mdcache->finish_rollback(reqid);
6263 }
6264
6265
6266 /** _dir_is_nonempty[_unlocked]
6267 *
6268 * check if a directory is non-empty (i.e. we can rmdir it).
6269 *
6270 * the unlocked variant is a fastpath check. we can't really be
6271 * sure until we rdlock the filelock.
6272 */
6273 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6274 {
6275 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6276 assert(in->is_auth());
6277
6278 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6279 return true; // in a snapshot!
6280
6281 list<CDir*> ls;
6282 in->get_dirfrags(ls);
6283 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6284 CDir *dir = *p;
6285 // is the frag obviously non-empty?
6286 if (dir->is_auth()) {
6287 if (dir->get_projected_fnode()->fragstat.size()) {
6288 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6289 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6290 return true;
6291 }
6292 }
6293 }
6294
6295 return false;
6296 }
6297
6298 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6299 {
6300 dout(10) << "dir_is_nonempty " << *in << dendl;
6301 assert(in->is_auth());
6302 assert(in->filelock.can_read(mdr->get_client()));
6303
6304 frag_info_t dirstat;
6305 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6306
6307 list<CDir*> ls;
6308 in->get_dirfrags(ls);
6309 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6310 CDir *dir = *p;
6311 const fnode_t *pf = dir->get_projected_fnode();
6312 if (pf->fragstat.size()) {
6313 dout(10) << "dir_is_nonempty dirstat has "
6314 << pf->fragstat.size() << " items " << *dir << dendl;
6315 return true;
6316 }
6317
6318 if (pf->accounted_fragstat.version == dirstat_version)
6319 dirstat.add(pf->accounted_fragstat);
6320 else
6321 dirstat.add(pf->fragstat);
6322 }
6323
6324 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6325 }
6326
6327
6328 // ======================================================
6329
6330
6331 class C_MDS_rename_finish : public ServerLogContext {
6332 CDentry *srcdn;
6333 CDentry *destdn;
6334 CDentry *straydn;
6335 public:
6336 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6337 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6338 ServerLogContext(s, r),
6339 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6340 void finish(int r) override {
6341 assert(r == 0);
6342 server->_rename_finish(mdr, srcdn, destdn, straydn);
6343 }
6344 };
6345
6346
6347 /** handle_client_rename
6348 *
6349 * rename master is the destdn auth. this is because cached inodes
6350 * must remain connected. thus, any replica of srci, must also
6351 * replicate destdn, and possibly straydn, so that srci (and
6352 * destdn->inode) remain connected during the rename.
6353 *
6354 * to do this, we freeze srci, then master (destdn auth) verifies that
6355 * all other nodes have also replciated destdn and straydn. note that
6356 * destdn replicas need not also replicate srci. this only works when
6357 * destdn is master.
6358 *
6359 * This function takes responsibility for the passed mdr.
6360 */
6361 void Server::handle_client_rename(MDRequestRef& mdr)
6362 {
6363 MClientRequest *req = mdr->client_request;
6364 dout(7) << "handle_client_rename " << *req << dendl;
6365
6366 filepath destpath = req->get_filepath();
6367 filepath srcpath = req->get_filepath2();
6368 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6369 respond_to_request(mdr, -EINVAL);
6370 return;
6371 }
6372 const string &destname = destpath.last_dentry();
6373
6374 vector<CDentry*>& srctrace = mdr->dn[1];
6375 vector<CDentry*>& desttrace = mdr->dn[0];
6376
6377 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6378
6379 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6380 if (!destdn) return;
6381 dout(10) << " destdn " << *destdn << dendl;
6382 if (mdr->snapid != CEPH_NOSNAP) {
6383 respond_to_request(mdr, -EROFS);
6384 return;
6385 }
6386 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6387 CDir *destdir = destdn->get_dir();
6388 assert(destdir->is_auth());
6389
6390 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6391 if (r > 0)
6392 return; // delayed
6393 if (r < 0) {
6394 if (r == -ESTALE) {
6395 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6396 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6397 } else {
6398 dout(10) << "FAIL on error " << r << dendl;
6399 respond_to_request(mdr, r);
6400 }
6401 return;
6402
6403 }
6404 assert(!srctrace.empty());
6405 CDentry *srcdn = srctrace[srctrace.size()-1];
6406 dout(10) << " srcdn " << *srcdn << dendl;
6407 if (srcdn->last != CEPH_NOSNAP) {
6408 respond_to_request(mdr, -EROFS);
6409 return;
6410 }
6411 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6412 CInode *srci = srcdnl->get_inode();
6413 dout(10) << " srci " << *srci << dendl;
6414
6415 CInode *oldin = 0;
6416 if (!destdnl->is_null()) {
6417 //dout(10) << "dest dn exists " << *destdn << dendl;
6418 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6419 if (!oldin) return;
6420 dout(10) << " oldin " << *oldin << dendl;
6421
6422 // mv /some/thing /to/some/existing_other_thing
6423 if (oldin->is_dir() && !srci->is_dir()) {
6424 respond_to_request(mdr, -EISDIR);
6425 return;
6426 }
6427 if (!oldin->is_dir() && srci->is_dir()) {
6428 respond_to_request(mdr, -ENOTDIR);
6429 return;
6430 }
6431
6432 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6433 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6434 respond_to_request(mdr, -ENOTEMPTY);
6435 return;
6436 }
6437 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6438 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6439 return;
6440 }
6441 }
6442
6443 // -- some sanity checks --
6444
6445 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6446 if (destpath.get_ino() != srcpath.get_ino() &&
6447 !(req->get_source().is_mds() &&
6448 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6449 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6450 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6451 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6452 while (srcbase != destbase &&
6453 !srcbase->is_projected_ancestor_of(destbase)) {
6454 CDentry *pdn = srcbase->get_projected_parent_dn();
6455 srctrace.insert(srctrace.begin(), pdn);
6456 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6457 srcbase = pdn->get_dir()->get_inode();
6458 }
6459
6460 // then, extend destpath until it shares the same parent inode as srcpath.
6461 while (destbase != srcbase) {
6462 CDentry *pdn = destbase->get_projected_parent_dn();
6463 desttrace.insert(desttrace.begin(), pdn);
6464 rdlocks.insert(&pdn->lock);
6465 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6466 destbase = pdn->get_dir()->get_inode();
6467 }
6468 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6469 }
6470
6471 // src == dest?
6472 if (srcdn->get_dir() == destdir && srcdn->name == destname) {
6473 dout(7) << "rename src=dest, noop" << dendl;
6474 respond_to_request(mdr, 0);
6475 return;
6476 }
6477
6478 // dest a child of src?
6479 // e.g. mv /usr /usr/foo
6480 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6481 while (pdn) {
6482 if (pdn == srcdn) {
6483 dout(7) << "cannot rename item to be a child of itself" << dendl;
6484 respond_to_request(mdr, -EINVAL);
6485 return;
6486 }
6487 pdn = pdn->get_dir()->inode->parent;
6488 }
6489
6490 // is this a stray migration, reintegration or merge? (sanity checks!)
6491 if (mdr->reqid.name.is_mds() &&
6492 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6493 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6494 !(destdnl->is_remote() &&
6495 destdnl->get_remote_ino() == srci->ino())) {
6496 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
6497 return;
6498 }
6499
6500 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6501 (srcdnl->is_primary() || destdnl->is_primary()));
6502 if (linkmerge)
6503 dout(10) << " this is a link merge" << dendl;
6504
6505 // -- create stray dentry? --
6506 CDentry *straydn = NULL;
6507 if (destdnl->is_primary() && !linkmerge) {
6508 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6509 if (!straydn)
6510 return;
6511 dout(10) << " straydn is " << *straydn << dendl;
6512 } else if (mdr->straydn) {
6513 mdr->unpin(mdr->straydn);
6514 mdr->straydn = NULL;
6515 }
6516
6517 // -- prepare witness list --
6518 /*
6519 * NOTE: we use _all_ replicas as witnesses.
6520 * this probably isn't totally necessary (esp for file renames),
6521 * but if/when we change that, we have to make sure rejoin is
6522 * sufficiently robust to handle strong rejoins from survivors
6523 * with totally wrong dentry->inode linkage.
6524 * (currently, it can ignore rename effects, because the resolve
6525 * stage will sort them out.)
6526 */
6527 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6528 if (srcdn->is_auth())
6529 srcdn->list_replicas(witnesses);
6530 else
6531 witnesses.insert(srcdn->authority().first);
6532 if (srcdnl->is_remote() && !srci->is_auth())
6533 witnesses.insert(srci->authority().first);
6534 destdn->list_replicas(witnesses);
6535 if (destdnl->is_remote() && !oldin->is_auth())
6536 witnesses.insert(oldin->authority().first);
6537 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6538
6539
6540 // -- locks --
6541 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6542
6543 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6544 for (int i=0; i<(int)srctrace.size(); i++)
6545 rdlocks.insert(&srctrace[i]->lock);
6546 xlocks.insert(&srcdn->lock);
6547 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6548 if (srcdirauth != mds->get_nodeid()) {
6549 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6550 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6551 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6552 if (srci->is_dir())
6553 rdlocks.insert(&srci->dirfragtreelock);
6554 } else {
6555 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6556 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6557 }
6558 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6559
6560 // straydn?
6561 if (straydn) {
6562 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6563 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6564 xlocks.insert(&straydn->lock);
6565 }
6566
6567 // xlock versionlock on dentries if there are witnesses.
6568 // replicas can't see projected dentry linkages, and will get
6569 // confused if we try to pipeline things.
6570 if (!witnesses.empty()) {
6571 // take xlock on all projected ancestor dentries for srcdn and destdn.
6572 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6573 for (int i= 0; i<(int)srctrace.size(); i++) {
6574 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6575 xlocks.insert(&srctrace[i]->versionlock);
6576 }
6577 for (int i=0; i<(int)desttrace.size(); i++) {
6578 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6579 xlocks.insert(&desttrace[i]->versionlock);
6580 }
6581 // xlock srci and oldin's primary dentries, so witnesses can call
6582 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6583 // is traversed.
6584 if (srcdnl->is_remote())
6585 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6586 if (destdnl->is_remote())
6587 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6588 }
6589
6590 // we need to update srci's ctime. xlock its least contended lock to do that...
6591 xlocks.insert(&srci->linklock);
6592
6593 // xlock oldin (for nlink--)
6594 if (oldin) {
6595 xlocks.insert(&oldin->linklock);
6596 if (oldin->is_dir())
6597 rdlocks.insert(&oldin->filelock);
6598 }
6599 if (srcdnl->is_primary() && srci->is_dir())
6600 // FIXME: this should happen whenever we are renamning between
6601 // realms, regardless of the file type
6602 // FIXME: If/when this changes, make sure to update the
6603 // "allowance" in handle_slave_rename_prep
6604 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6605 else
6606 rdlocks.insert(&srci->snaplock);
6607
6608 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6609 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6610 &remote_wrlocks, auth_pin_freeze))
6611 return;
6612
6613 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6614 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6615 return;
6616
6617 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6618 return;
6619
6620 if (!check_fragment_space(mdr, destdn->get_dir()))
6621 return;
6622
6623 if (!check_access(mdr, srci, MAY_WRITE))
6624 return;
6625 }
6626
6627 // with read lock, really verify oldin is empty
6628 if (oldin &&
6629 oldin->is_dir() &&
6630 _dir_is_nonempty(mdr, oldin)) {
6631 respond_to_request(mdr, -ENOTEMPTY);
6632 return;
6633 }
6634
6635 /* project_past_snaprealm_parent() will do this job
6636 *
6637 // moving between snaprealms?
6638 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6639 SnapRealm *srcrealm = srci->find_snaprealm();
6640 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6641 if (srcrealm != destrealm &&
6642 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6643 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6644 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6645 mdcache->snaprealm_create(mdr, srci);
6646 return;
6647 }
6648 }
6649 */
6650
6651 assert(g_conf->mds_kill_rename_at != 1);
6652
6653 // -- open all srcdn inode frags, if any --
6654 // we need these open so that auth can properly delegate from inode to dirfrags
6655 // after the inode is _ours_.
6656 if (srcdnl->is_primary() &&
6657 !srcdn->is_auth() &&
6658 srci->is_dir()) {
6659 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6660 mdr->set_stickydirs(srci);
6661
6662 list<frag_t> frags;
6663 srci->dirfragtree.get_leaves(frags);
6664 for (list<frag_t>::iterator p = frags.begin();
6665 p != frags.end();
6666 ++p) {
6667 CDir *dir = srci->get_dirfrag(*p);
6668 if (!dir) {
6669 dout(10) << " opening " << *p << " under " << *srci << dendl;
6670 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6671 return;
6672 }
6673 }
6674 }
6675
6676 // -- prepare witnesses --
6677
6678 // do srcdn auth last
6679 mds_rank_t last = MDS_RANK_NONE;
6680 if (!srcdn->is_auth()) {
6681 last = srcdn->authority().first;
6682 mdr->more()->srcdn_auth_mds = last;
6683 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6684 // are involved in the rename operation.
6685 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6686 dout(10) << " preparing ambiguous auth for srci" << dendl;
6687 assert(mdr->more()->is_remote_frozen_authpin);
6688 assert(mdr->more()->rename_inode == srci);
6689 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6690 return;
6691 }
6692 }
6693
6694 for (set<mds_rank_t>::iterator p = witnesses.begin();
6695 p != witnesses.end();
6696 ++p) {
6697 if (*p == last) continue; // do it last!
6698 if (mdr->more()->witnessed.count(*p)) {
6699 dout(10) << " already witnessed by mds." << *p << dendl;
6700 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6701 dout(10) << " already waiting on witness mds." << *p << dendl;
6702 } else {
6703 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6704 return;
6705 }
6706 }
6707 if (!mdr->more()->waiting_on_slave.empty())
6708 return; // we're waiting for a witness.
6709
6710 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6711 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6712 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6713 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6714 return;
6715 }
6716
6717 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6718 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6719 assert(g_conf->mds_kill_rename_at != 3);
6720 if (!mdr->more()->slaves.empty() && srci->is_dir())
6721 assert(g_conf->mds_kill_rename_at != 4);
6722
6723 // -- declare now --
6724 mdr->set_mds_stamp(ceph_clock_now());
6725
6726 // -- prepare journal entry --
6727 mdr->ls = mdlog->get_current_segment();
6728 EUpdate *le = new EUpdate(mdlog, "rename");
6729 mdlog->start_entry(le);
6730 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6731 if (!mdr->more()->witnessed.empty()) {
6732 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6733
6734 le->reqid = mdr->reqid;
6735 le->had_slaves = true;
6736
6737 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
    // no need to send frozen auth pin to recovering auth MDS of srci
6739 mdr->more()->is_remote_frozen_authpin = false;
6740 }
6741
6742 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6743 if (le->client_map.length())
6744 le->cmapv = mds->sessionmap.get_projected();
6745
6746 // -- commit locally --
6747 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6748
6749 journal_and_reply(mdr, srci, destdn, le, fin);
6750 }
6751
6752
/*
 * Journal-commit completion for a client rename.  Applies the projected
 * rename to the in-memory cache, notifies replicas/clients, and replies.
 * Runs after the EUpdate for the rename has been safely journaled.
 */
void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  // if slaves witnessed this rename, record that the master update has now
  // been journaled so peer resolution can proceed.
  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  // apply the projected linkages/inodes to the live cache
  _rename_apply(mdr, srcdn, destdn, straydn);

  // tell replicas about the new dest linkage
  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  // if caps were imported for this inode, re-evaluate its lock state below
  // (after we have replied and dropped locks)
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test slave commit
  if (!mdr->more()->slaves.empty() && !in->is_dir())
    assert(g_conf->mds_kill_rename_at != 5);
  if (!mdr->more()->slaves.empty() && in->is_dir())
    assert(g_conf->mds_kill_rename_at != 6);

  // bump popularity
  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);

  // did we import srci? if so, explicitly ack that import, before we unlock and reply.

  assert(g_conf->mds_kill_rename_at != 7);

  // reply
  respond_to_request(mdr, 0);

  if (need_eval)
    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // clean up?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
  }
}
6796
6797
6798
6799 // helpers
6800
/*
 * Ask mds 'who' to witness this rename by sending it an OP_RENAMEPREP
 * slave request carrying the src/dest paths, a stray-dentry replica (if
 * any), and the current witness set.
 *
 * Returns false (and arranges a retry) if the peer is not in a state that
 * can service slave requests; returns true once the request is in flight
 * and 'who' has been added to waiting_on_slave.
 *
 * NOTE: the 'witnesse' parameter name is a historical typo for
 * 'witnesses'; kept as-is to avoid churn.
 */
bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
				     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
{
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
    // only register one waiter for the whole request
    if (mdr->more()->waiting_on_slave.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rename_prepare_witness mds." << who << dendl;
  MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
					       MMDSSlaveRequest::OP_RENAMEPREP);

  // encode the full dentry paths so the slave can traverse/discover them
  req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
  for (auto dn : srctrace)
    req->srcdnpath.push_dentry(dn->name);
  req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
  for (auto dn : dsttrace)
    req->destdnpath.push_dentry(dn->name);
  if (straydn)
    mdcache->replicate_stray(straydn, who, req->stray);

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = witnesse;

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  assert(mdr->more()->waiting_on_slave.count(who) == 0);
  mdr->more()->waiting_on_slave.insert(who);
  return true;
}
6835
/*
 * Decode the inode (and its client caps) that the srcdn auth bundled into
 * this rename, staging the import on our side.
 *
 * Decodes the imported client map from mdr->more()->inode_import, re-encodes
 * it into *client_map_bl for the journal entry, pre-opens the client
 * sessions, and lets the migrator decode the inode itself.
 *
 * Returns the projected version of the inode on the exporting side
 * (inode_import_v), which the caller uses for pre_dirty().
 */
version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
{
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  /* import node */
  bufferlist::iterator blp = mdr->more()->inode_import.begin();

  // imported caps: decode, then re-encode for the journal's client map
  ::decode(mdr->more()->imported_client_map, blp);
  ::encode(mdr->more()->imported_client_map, *client_map_bl,
	   mds->mdsmap->get_up_features());
  prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
					 mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily; the real auth flip
  // happens in _rename_apply() once the rename is journaled.
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();

  return oldpv;
}
6861
6862 bool Server::_need_force_journal(CInode *diri, bool empty)
6863 {
6864 list<CDir*> ls;
6865 diri->get_dirfrags(ls);
6866
6867 bool force_journal = false;
6868 if (empty) {
6869 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6870 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
6871 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
6872 force_journal = true;
6873 break;
6874 } else
6875 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
6876 }
6877 } else {
6878 // see if any children of our frags are auth subtrees.
6879 list<CDir*> subtrees;
6880 mdcache->list_subtrees(subtrees);
6881 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
6882 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6883 CDir *dir = *p;
6884 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
6885 if (dir->contains(*q)) {
6886 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
6887 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
6888 << **q << dendl;
6889 force_journal = true;
6890 break;
6891 } else
6892 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
6893 } else
6894 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
6895 }
6896 if (force_journal)
6897 break;
6898 }
6899 }
6900 return force_journal;
6901 }
6902
/*
 * Build the journal entry (metablob) for a rename and push all projected
 * state: target-inode unlink/stray move, dest linkage, src null linkage,
 * nlink/ctime updates, and rstat/fragstat predirties.
 *
 * Used by both the master path and the slave prepare path (the slave
 * passes a throw-away client_map_bl).  Statement order here mirrors the
 * order journal replay expects (e.g. stray dir before dest link).
 */
void Server::_rename_prepare(MDRequestRef& mdr,
			     EMetaBlob *metablob, bufferlist *client_map_bl,
			     CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  if (straydn)
    dout(10) << " straydn " << *straydn << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srci == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));
  // renames out of a stray dir (reintegration) are "silent": they must not
  // perturb nlink or dir mtimes
  bool silent = srcdn->get_dir()->inode->is_stray();

  bool force_journal_dest = false;
  if (srci->is_dir() && !destdn->is_auth()) {
    if (srci->is_auth()) {
      // if we are auth for srci and exporting it, force journal because journal replay needs
      // the source inode to create auth subtrees.
      dout(10) << " we are exporting srci, will force journal destdn" << dendl;
      force_journal_dest = true;
    } else
      force_journal_dest = _need_force_journal(srci, false);
  }

  bool force_journal_stray = false;
  if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
    force_journal_stray = _need_force_journal(oldin, true);

  if (linkmerge)
    dout(10) << " merging remote and primary links to the same inode" << dendl;
  if (silent)
    dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
  if (force_journal_dest)
    dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
  if (force_journal_stray)
    dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;

  // record the renamed directory ino so replay can fix up subtrees
  if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
    dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = srci->ino();
  } else if (oldin && oldin->is_dir() && force_journal_stray) {
    dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = oldin->ino();
  }

  // prepare
  inode_t *pi = 0;    // renamed inode (projected)
  inode_t *tpi = 0;   // target/overwritten inode (projected)

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      assert(straydn);  // moving to straydn.
      // link--, and move.
      if (destdn->is_auth()) {
	tpi = oldin->project_inode(); //project_snaprealm
	tpi->version = straydn->pre_dirty(tpi->version);
	tpi->update_backtrace();
      }
      straydn->push_projected_linkage(oldin);
    } else if (destdnl->is_remote()) {
      // nlink-- targeti
      if (oldin->is_auth()) {
	tpi = oldin->project_inode();
	tpi->version = oldin->pre_dirty();
      }
    }
  }

  // dest
  if (srcdnl->is_remote()) {
    if (!linkmerge) {
      // destdn
      if (destdn->is_auth())
	mdr->more()->pvmap[destdn] = destdn->pre_dirty();
      destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      // srci
      if (srci->is_auth()) {
	pi = srci->project_inode();
	pi->version = srci->pre_dirty();
      }
    } else {
      dout(10) << " will merge remote onto primary link" << dendl;
      if (destdn->is_auth()) {
	pi = oldin->project_inode();
	pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
      }
    }
  } else { // primary
    if (destdn->is_auth()) {
      version_t oldpv;
      if (srcdn->is_auth())
	oldpv = srci->get_projected_version();
      else {
	// srci lives on another mds; stage the inode import now
	oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);

	// note which dirfrags have child subtrees in the journal
	// event, so that we can open those (as bounds) during replay.
	if (srci->is_dir()) {
	  list<CDir*> ls;
	  srci->get_dirfrags(ls);
	  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	    CDir *dir = *p;
	    if (!dir->is_auth())
	      metablob->renamed_dir_frags.push_back(dir->get_frag());
	  }
	  dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
	}
      }
      pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
				  // & srcdnl->snaprealm
      pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
      pi->update_backtrace();
    }
    destdn->push_projected_linkage(srci);
  }

  // src
  if (srcdn->is_auth())
    mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
  srcdn->push_projected_linkage();  // push null linkage

  if (!silent) {
    if (pi) {
      pi->ctime = mdr->get_op_stamp();
      pi->change_attr++;
      if (linkmerge)
	pi->nlink--;            // merging two links into one drops the count
    }
    if (tpi) {
      tpi->ctime = mdr->get_op_stamp();
      tpi->change_attr++;
      // remember where the overwritten inode used to live, for stray handling
      destdn->make_path_string(tpi->stray_prior_path);
      tpi->nlink--;
      if (tpi->nlink == 0)
	oldin->state_set(CInode::STATE_ORPHAN);
    }
  }

  // prepare nesting, mtime updates
  int predirty_dir = silent ? 0:PREDIRTY_DIR;

  // guarantee stray dir is processed first during journal replay. unlink the old inode,
  // then link the source inode to destdn
  if (destdnl->is_primary()) {
    assert(straydn);
    if (straydn->is_auth()) {
      metablob->add_dir_context(straydn->get_dir());
      metablob->add_dir(straydn->get_dir(), true);
    }
  }

  // sub off target
  if (destdn->is_auth() && !destdnl->is_null()) {
    mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
				      (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
    if (destdnl->is_primary())
      mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
					PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  }

  // move srcdn
  int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
  int flags = predirty_dir | predirty_primary;
  if (srcdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
  if (destdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);

  SnapRealm *src_realm = srci->find_snaprealm();
  SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
  snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;

  // add it all to the metablob
  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      if (destdn->is_auth()) {
	// project snaprealm, too
	if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
	  oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
	straydn->first = MAX(oldin->first, next_dest_snap);
	metablob->add_primary_dentry(straydn, oldin, true, true);
      } else if (force_journal_stray) {
	dout(10) << " forced journaling straydn " << *straydn << dendl;
	metablob->add_dir_context(straydn->get_dir());
	metablob->add_primary_dentry(straydn, oldin, true);
      }
    } else if (destdnl->is_remote()) {
      if (oldin->is_auth()) {
	// auth for targeti
	metablob->add_dir_context(oldin->get_projected_parent_dir());
	mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
				    CEPH_NOSNAP, 0, destdnl);
	metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
      }
    }
  }

  // dest
  if (srcdnl->is_remote()) {
    if (!linkmerge) {
      if (destdn->is_auth() && !destdnl->is_null())
	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
      else
	destdn->first = MAX(destdn->first, next_dest_snap);

      if (destdn->is_auth())
	metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
	metablob->add_dir_context(srci->get_projected_parent_dir());
	mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
	metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
      }
    } else {
      if (destdn->is_auth() && !destdnl->is_null())
	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
      else
	destdn->first = MAX(destdn->first, next_dest_snap);

      if (destdn->is_auth())
	metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
    }
  } else if (srcdnl->is_primary()) {
    // project snap parent update?
    if (destdn->is_auth() && src_realm != dest_realm &&
	(srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
      srci->project_past_snaprealm_parent(dest_realm);

    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
    else
      destdn->first = MAX(destdn->first, next_dest_snap);

    if (destdn->is_auth())
      metablob->add_primary_dentry(destdn, srci, true, true);
    else if (force_journal_dest) {
      dout(10) << " forced journaling destdn " << *destdn << dendl;
      metablob->add_dir_context(destdn->get_dir());
      metablob->add_primary_dentry(destdn, srci, true);
      if (srcdn->is_auth() && srci->is_dir()) {
	// journal new subtrees root dirfrags
	list<CDir*> ls;
	srci->get_dirfrags(ls);
	for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	  CDir *dir = *p;
	  if (dir->is_auth())
	    metablob->add_dir(dir, true);
	}
      }
    }
  }

  // src
  if (srcdn->is_auth()) {
    dout(10) << " journaling srcdn " << *srcdn << dendl;
    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
    // also journal the inode in case we need do slave rename rollback. It is Ok to add
    // both primary and NULL dentries. Because during journal replay, null dentry is
    // processed after primary dentry.
    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
      metablob->add_primary_dentry(srcdn, srci, true);
    metablob->add_null_dentry(srcdn, true);
  } else
    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;

  // make renamed inode first track the dn
  if (srcdnl->is_primary() && destdn->is_auth())
    srci->first = destdn->first;

  // project subtree-map changes for any directories being moved
  if (oldin && oldin->is_dir())
    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
  if (srci->is_dir())
    mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());

}
7184
7185
/*
 * Apply a journaled rename to the live cache: pop the projected linkages
 * pushed by _rename_prepare(), unlink src, link dest, move any overwritten
 * inode to the stray dir, finish any inode/caps import, and fix up the
 * subtree map.  Used by both master and slave commit paths.
 */
void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  bool imported_inode = false;

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));

  // target inode: detach the overwritten inode first
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;
      destdn->get_dir()->unlink_inode(destdn);

      straydn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
	assert(!straydn->is_projected()); // no other projected

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.

      // nlink-- targeti
      if (destdn->is_auth()) {
	bool hadrealm = (oldin->snaprealm ? true : false);
	oldin->pop_and_dirty_projected_inode(mdr->ls);
	// if popping the projection opened a snaprealm, notify clients
	if (oldin->snaprealm && !hadrealm)
	  mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
      } else {
	// FIXME this snaprealm is not filled out correctly
	//oldin->open_snaprealm();  might be sufficient..
      }
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn);
      if (oldin->is_auth())
	oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();
  assert(in);

  bool srcdn_was_remote = srcdnl->is_remote();
  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
	assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
	destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      if (in->is_auth())
	in->pop_and_dirty_projected_inode(mdr->ls);
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
      assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?
    if (!srcdn->is_auth() && destdn->is_auth()) {
      assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
	mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
						    mdr->more()->srcdn_auth_mds, true,
						    mdr->more()->cap_imports[destdnl->get_inode()],
						    imported_caps);
      }

      // replace the import bufferlist with the resulting cap-import map,
      // which is later sent back to the exporter in the commit ack
      mdr->more()->inode_import.clear();
      ::encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */
      for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
	   i != mdr->xlocks.end();
	   ++i)
	if ((*i)->get_parent() == destdnl->get_inode() &&
	    !(*i)->is_locallock())
	  mds->locker->xlock_import(*i);

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);
      imported_inode = true;

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth()) {
      in->pop_and_dirty_projected_inode(mdr->ls);

    } else {
      // FIXME: fix up snaprealm!
    }
  }

  // src: pop the null linkage pushed in _rename_prepare()
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
    assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true, imported_inode);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}
7330
7331
7332
7333 // ------------
7334 // SLAVE
7335
7336 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7337 CDentry *srcdn, *destdn, *straydn;
7338 public:
7339 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7340 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7341 void finish(int r) override {
7342 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7343 }
7344 };
7345
7346 class C_MDS_SlaveRenameCommit : public ServerContext {
7347 MDRequestRef mdr;
7348 CDentry *srcdn, *destdn, *straydn;
7349 public:
7350 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7351 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7352 void finish(int r) override {
7353 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7354 }
7355 };
7356
7357 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7358 MDRequestRef mdr;
7359 public:
7360 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7361 ServerContext(s), mdr(r) {}
7362 void finish(int r) override {
7363 server->_slave_rename_sessions_flushed(mdr);
7364 }
7365 };
7366
7367 /* This function DOES put the mdr->slave_request before returning*/
7368 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7369 {
7370 dout(10) << "handle_slave_rename_prep " << *mdr
7371 << " " << mdr->slave_request->srcdnpath
7372 << " to " << mdr->slave_request->destdnpath
7373 << dendl;
7374
7375 // discover destdn
7376 filepath destpath(mdr->slave_request->destdnpath);
7377 dout(10) << " dest " << destpath << dendl;
7378 vector<CDentry*> trace;
7379 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7380 if (r > 0) return;
7381 if (r == -ESTALE) {
7382 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7383 mdr->slave_to_mds);
7384 return;
7385 }
7386 assert(r == 0); // we shouldn't get an error here!
7387
7388 CDentry *destdn = trace[trace.size()-1];
7389 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7390 dout(10) << " destdn " << *destdn << dendl;
7391 mdr->pin(destdn);
7392
7393 // discover srcdn
7394 filepath srcpath(mdr->slave_request->srcdnpath);
7395 dout(10) << " src " << srcpath << dendl;
7396 CInode *srci = nullptr;
7397 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7398 if (r > 0) return;
7399 assert(r == 0);
7400
7401 // srcpath must not point to a null dentry
7402 assert(srci != nullptr);
7403
7404 CDentry *srcdn = trace[trace.size()-1];
7405 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7406 dout(10) << " srcdn " << *srcdn << dendl;
7407 mdr->pin(srcdn);
7408 mdr->pin(srci);
7409
7410 // stray?
7411 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7412 (srcdnl->is_primary() || destdnl->is_primary()));
7413 CDentry *straydn = mdr->straydn;
7414 if (destdnl->is_primary() && !linkmerge)
7415 assert(straydn);
7416
7417 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7418 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7419
7420 // set up commit waiter (early, to clean up any freezing etc we do)
7421 if (!mdr->more()->slave_commit)
7422 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7423
7424 // am i srcdn auth?
7425 if (srcdn->is_auth()) {
7426 set<mds_rank_t> srcdnrep;
7427 srcdn->list_replicas(srcdnrep);
7428
7429 bool reply_witness = false;
7430 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7431 // freeze?
7432 // we need this to
7433 // - avoid conflicting lock state changes
7434 // - avoid concurrent updates to the inode
7435 // (this could also be accomplished with the versionlock)
7436 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7437 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7438 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7439 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7440
7441 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7442 if (srcdnl->get_inode()->is_frozen_auth_pin())
7443 mdr->unfreeze_auth_pin();
7444
7445 if (!frozen_inode) {
7446 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7447 return;
7448 }
7449
7450 /*
7451 * set ambiguous auth for srci
7452 * NOTE: we don't worry about ambiguous cache expire as we do
7453 * with subtree migrations because all slaves will pin
7454 * srcdn->get_inode() for duration of this rename.
7455 */
7456 mdr->set_ambiguous_auth(srcdnl->get_inode());
7457
7458 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7459 // the master will send another OP_RENAMEPREP slave request later.
7460 if (mdr->slave_request->witnesses.size() > 1) {
7461 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7462 reply_witness = true;
7463 }
7464
7465 // make sure bystanders have received all lock related messages
7466 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7467 if (*p == mdr->slave_to_mds ||
7468 (mds->is_cluster_degraded() &&
7469 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7470 continue;
7471 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7472 MMDSSlaveRequest::OP_RENAMENOTIFY);
7473 mds->send_message_mds(notify, *p);
7474 mdr->more()->waiting_on_slave.insert(*p);
7475 }
7476
7477 // make sure clients have received all cap related messages
7478 set<client_t> export_client_set;
7479 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7480
7481 MDSGatherBuilder gather(g_ceph_context);
7482 flush_client_sessions(export_client_set, gather);
7483 if (gather.has_subs()) {
7484 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7485 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7486 gather.activate();
7487 }
7488 }
7489
7490 // is witness list sufficient?
7491 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7492 if (*p == mdr->slave_to_mds ||
7493 mdr->slave_request->witnesses.count(*p)) continue;
7494 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7495 reply_witness = true;
7496 break;
7497 }
7498
7499 if (reply_witness) {
7500 assert(!srcdnrep.empty());
7501 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7502 MMDSSlaveRequest::OP_RENAMEPREPACK);
7503 reply->witnesses.swap(srcdnrep);
7504 mds->send_message_mds(reply, mdr->slave_to_mds);
7505 mdr->slave_request->put();
7506 mdr->slave_request = 0;
7507 return;
7508 }
7509 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7510 if (!mdr->more()->waiting_on_slave.empty()) {
7511 dout(10) << " still waiting for rename notify acks from "
7512 << mdr->more()->waiting_on_slave << dendl;
7513 return;
7514 }
7515 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7516 // set ambiguous auth for srci on witnesses
7517 mdr->set_ambiguous_auth(srcdnl->get_inode());
7518 }
7519
7520 // encode everything we'd need to roll this back... basically, just the original state.
7521 rename_rollback rollback;
7522
7523 rollback.reqid = mdr->reqid;
7524
7525 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7526 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7527 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7528 rollback.orig_src.dname = srcdn->name;
7529 if (srcdnl->is_primary())
7530 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7531 else {
7532 assert(srcdnl->is_remote());
7533 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7534 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7535 }
7536
7537 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7538 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7539 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7540 rollback.orig_dest.dname = destdn->name;
7541 if (destdnl->is_primary())
7542 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7543 else if (destdnl->is_remote()) {
7544 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7545 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7546 }
7547
7548 if (straydn) {
7549 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7550 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7551 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7552 rollback.stray.dname = straydn->name;
7553 }
7554 ::encode(rollback, mdr->more()->rollback_bl);
7555 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7556
7557 // journal.
7558 mdr->ls = mdlog->get_current_segment();
7559 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7560 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7561 mdlog->start_entry(le);
7562 le->rollback = mdr->more()->rollback_bl;
7563
7564 bufferlist blah; // inode import data... obviously not used if we're the slave
7565 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7566
7567 if (le->commit.empty()) {
7568 dout(10) << " empty metablob, skipping journal" << dendl;
7569 mdlog->cancel_entry(le);
7570 mdr->ls = NULL;
7571 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7572 } else {
7573 mdr->more()->slave_update_journaled = true;
7574 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7575 mdr, __func__);
7576 mdlog->flush();
7577 }
7578 }
7579
/*
 * Finisher for the slave side of rename prep: runs once the
 * ESlaveUpdate::OP_PREPARE event has been journaled (or immediately, when
 * the metablob was empty and journaling was skipped).  Applies the rename
 * locally and acks the master with OP_RENAMEPREPACK; when this rank was
 * auth for a primary source inode moving to another rank, the ack also
 * carries the encoded inode export blob.
 */
7580 void Server::_logged_slave_rename(MDRequestRef& mdr,
7581 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7582 {
7583 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7584
7585 // prepare ack
// If the request was aborted, send no ack; we finish the request at the
// bottom of this function instead.
7586 MMDSSlaveRequest *reply = NULL;
7587 if (!mdr->aborted) {
7588 reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7589 if (!mdr->more()->slave_update_journaled)
7590 reply->mark_not_journaled();
7591 }
7592
7593 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7594 CDentry::linkage_t *destdnl = NULL;
7595 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7596
7597 // export srci?
7598 if (srcdn->is_auth() && srcdnl->is_primary()) {
7599 // set export bounds for CInode::encode_export()
7600 list<CDir*> bounds;
7601 if (srcdnl->get_inode()->is_dir()) {
7602 srcdnl->get_inode()->get_dirfrags(bounds);
7603 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7604 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7605 }
7606
7607 map<client_t,entity_inst_t> exported_client_map;
7608 bufferlist inodebl;
7609 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7610 exported_client_map);
7611
// EXPORTBOUND was only needed for the duration of the encode.
7612 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7613 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7614
7615 if (reply) {
7616 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7617 reply->inode_export.claim_append(inodebl);
7618 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7619 }
7620
7621 // remove mdr auth pin
7622 mdr->auth_unpin(srcdnl->get_inode());
// Remember the export so _commit_slave_rename() can finish it on commit
// or reverse it (abort_export) on abort.
7623 mdr->more()->is_inode_exporter = true;
7624
// NOTE(review): dirty state is dropped here — presumably because the
// exported inode is no longer journaled by this rank; confirm against
// the Migrator export path.
7625 if (srcdnl->get_inode()->is_dirty())
7626 srcdnl->get_inode()->mark_clean();
7627
7628 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
7629 }
7630
7631 // apply
7632 _rename_apply(mdr, srcdn, destdn, straydn);
7633
// destdn's linkage is only valid after _rename_apply() has relinked it.
7634 destdnl = destdn->get_linkage();
7635
7636 // bump popularity
7637 mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
7638 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7639 mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
7640 META_POP_IWR);
7641
7642 // done.
7643 mdr->slave_request->put();
7644 mdr->slave_request = 0;
7645 mdr->straydn = 0;
7646
7647 if (reply) {
7648 mds->send_message_mds(reply, mdr->slave_to_mds);
7649 } else {
7650 assert(mdr->aborted);
7651 dout(10) << " abort flag set, finishing" << dendl;
7652 mdcache->request_finish(mdr);
7653 }
7654 }
7655
/*
 * Handle the master's resolution of a slave rename.
 *
 * r == 0: commit.  Finish any inode export started in
 * _logged_slave_rename() (hand off xlocks, import peer caps, unfreeze),
 * clear ambiguous auth, then journal ESlaveUpdate::OP_COMMIT (or call
 * _committed_slave() directly if the prep was never journaled).
 *
 * r != 0: abort.  Reverse the inode export if one was staged, and roll
 * the rename back via do_rename_rollback() using the rollback blob we
 * encoded during prep.
 */
7656 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7657 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7658 {
7659 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7660
7661 CDentry::linkage_t *destdnl = destdn->get_linkage();
7662
7663 list<MDSInternalContextBase*> finished;
7664 if (r == 0) {
7665 // unfreeze+singleauth inode
7666 // hmm, do i really need to delay this?
7667 if (mdr->more()->is_inode_exporter) {
7668
7669 CInode *in = destdnl->get_inode();
7670
7671 // drop our pins
7672 // we exported, clear out any xlocks that we moved to another MDS
// Iterator is advanced before xlock_export() may mutate mdr->xlocks.
7673 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7674 while (i != mdr->xlocks.end()) {
7675 SimpleLock *lock = *i++;
7676
7677 // we only care about xlocks on the exported inode
7678 if (lock->get_parent() == in &&
7679 !lock->is_locallock())
7680 mds->locker->xlock_export(lock, mdr.get());
7681 }
7682
// Caps the importer granted its clients; stashed in inode_import by
// handle_slave_rename_prep_ack() on the master side of the exchange.
7683 map<client_t,Capability::Import> peer_imported;
7684 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7685 ::decode(peer_imported, bp);
7686
7687 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7688 mdcache->migrator->finish_export_inode(destdnl->get_inode(),
7689 mdr->get_mds_stamp(),
7690 mdr->slave_to_mds, peer_imported, finished);
7691 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
7692
7693 // unfreeze
7694 assert(destdnl->get_inode()->is_frozen_inode());
7695 destdnl->get_inode()->unfreeze_inode(finished);
7696 }
7697
7698 // singleauth
7699 if (mdr->more()->is_ambiguous_auth) {
7700 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7701 mdr->more()->is_ambiguous_auth = false;
7702 }
7703
7704
7705 mds->queue_waiters(finished);
7706 mdr->cleanup();
7707
7708 if (mdr->more()->slave_update_journaled) {
7709 // write a commit to the journal
7710 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7711 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7712 ESlaveUpdate::RENAME);
7713 mdlog->start_entry(le);
7714 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7715 mdlog->flush();
7716 } else {
// Nothing was journaled for the prep, so there is nothing to commit
// on disk; acknowledge directly.
7717 _committed_slave(mdr);
7718 }
7719 } else {
7720
7721 // abort
7722 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7723 // witness list from the master, and they failed before we tried prep again.
7724 if (mdr->more()->rollback_bl.length()) {
7725 if (mdr->more()->is_inode_exporter) {
7726 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7727 destdnl->get_inode()->abort_export();
7728 }
7729 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7730 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7731 // rollback but preserve the slave request
7732 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7733 mdr->more()->rollback_bl.clear();
7734 } else
7735 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7736 } else {
7737 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
7738 // singleauth
7739 if (mdr->more()->is_ambiguous_auth) {
7740 if (srcdn->is_auth())
7741 mdr->more()->rename_inode->unfreeze_inode(finished);
7742
7743 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7744 mdr->more()->is_ambiguous_auth = false;
7745 }
7746 mds->queue_waiters(finished);
7747 mdcache->request_finish(mdr);
7748 }
7749 }
7750 }
7751
7752 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7753 bool isdir, int linkunlink, nest_info_t &rstat)
7754 {
7755 fnode_t *pf;
7756 pf = dir->project_fnode();
7757 mut->add_projected_fnode(dir);
7758 pf->version = dir->pre_dirty();
7759
7760 if (isdir) {
7761 pf->fragstat.nsubdirs += linkunlink;
7762 } else {
7763 pf->fragstat.nfiles += linkunlink;
7764 }
7765 if (r.ino) {
7766 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7767 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7768 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7769 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7770 }
7771 if (pf->fragstat.mtime == ctime) {
7772 pf->fragstat.mtime = r.dirfrag_old_mtime;
7773 if (pf->rstat.rctime == ctime)
7774 pf->rstat.rctime = r.dirfrag_old_rctime;
7775 }
7776 mut->add_updated_lock(&dir->get_inode()->filelock);
7777 mut->add_updated_lock(&dir->get_inode()->nestlock);
7778 }
7779
7780 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
7781 MutationRef mut;
7782 CDentry *srcdn;
7783 version_t srcdnpv;
7784 CDentry *destdn;
7785 CDentry *straydn;
7786 bool finish_mdr;
7787 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
7788 CDentry *sd, version_t pv, CDentry *dd,
7789 CDentry *st, bool f) :
7790 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
7791 straydn(st), finish_mdr(f) {}
7792 void finish(int r) override {
7793 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
7794 destdn, straydn, finish_mdr);
7795 }
7796 };
7797
/*
 * Roll a slave rename back to its pre-rename state using the rollback
 * blob encoded by handle_slave_rename_prep().  Re-links srcdn / destdn /
 * straydn to their original targets, restores the saved dirfrag stats
 * and ctimes, journals an ESlaveUpdate::OP_ROLLBACK (unless nothing was
 * journaled for the prep), and ends in _rename_rollback_finish().
 *
 * master:     rank that drove the original rename
 * mdr:        slave request being rolled back; may be null when invoked
 *             during resolve (the null checks below rely on that)
 * finish_mdr: whether _rename_rollback_finish() should finish mdr
 */
7798 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
7799 bool finish_mdr)
7800 {
7801 rename_rollback rollback;
7802 bufferlist::iterator p = rbl.begin();
7803 ::decode(rollback, p);
7804
7805 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
7806 // need to finish this update before sending resolve to claim the subtree
7807 mdcache->add_rollback(rollback.reqid, master);
7808
7809 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7810 mut->ls = mds->mdlog->get_current_segment();
7811
// Look up the source dentry; any of srcdir/srcdn may be gone from cache.
7812 CDentry *srcdn = NULL;
7813 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
7814 if (!srcdir)
7815 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
7816 if (srcdir) {
7817 dout(10) << " srcdir " << *srcdir << dendl;
7818 srcdn = srcdir->lookup(rollback.orig_src.dname);
7819 if (srcdn) {
7820 dout(10) << " srcdn " << *srcdn << dendl;
7821 assert(srcdn->get_linkage()->is_null());
7822 } else
7823 dout(10) << " srcdn not found" << dendl;
7824 } else
7825 dout(10) << " srcdir not found" << dendl;
7826
// Likewise for the destination dentry.
7827 CDentry *destdn = NULL;
7828 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
7829 if (!destdir)
7830 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
7831 if (destdir) {
7832 dout(10) << " destdir " << *destdir << dendl;
7833 destdn = destdir->lookup(rollback.orig_dest.dname);
7834 if (destdn)
7835 dout(10) << " destdn " << *destdn << dendl;
7836 else
7837 dout(10) << " destdn not found" << dendl;
7838 } else
7839 dout(10) << " destdir not found" << dendl;
7840
// The inode originally linked at srcdn (primary or remote).
7841 CInode *in = NULL;
7842 if (rollback.orig_src.ino) {
7843 in = mdcache->get_inode(rollback.orig_src.ino);
7844 if (in && in->is_dir())
7845 assert(srcdn && destdn);
7846 } else
7847 in = mdcache->get_inode(rollback.orig_src.remote_ino);
7848
7849 CDir *straydir = NULL;
7850 CDentry *straydn = NULL;
7851 if (rollback.stray.dirfrag.ino) {
7852 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
7853 if (straydir) {
7854 dout(10) << "straydir " << *straydir << dendl;
7855 straydn = straydir->lookup(rollback.stray.dname);
7856 if (straydn) {
7857 dout(10) << " straydn " << *straydn << dendl;
7858 assert(straydn->get_linkage()->is_primary());
7859 } else
7860 dout(10) << " straydn not found" << dendl;
7861 } else
7862 dout(10) << "straydir not found" << dendl;
7863 }
7864
// The inode originally linked at destdn (the rename target), if any.
7865 CInode *target = NULL;
7866 if (rollback.orig_dest.ino) {
7867 target = mdcache->get_inode(rollback.orig_dest.ino);
7868 if (target)
7869 assert(destdn && straydn);
7870 } else if (rollback.orig_dest.remote_ino)
7871 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
7872
7873 // can't use is_auth() in the resolve stage
7874 mds_rank_t whoami = mds->get_nodeid();
7875 // slave
7876 assert(!destdn || destdn->authority().first != whoami);
7877 assert(!straydn || straydn->authority().first != whoami);
7878
7879 bool force_journal_src = false;
7880 bool force_journal_dest = false;
7881 if (in && in->is_dir() && srcdn->authority().first != whoami)
7882 force_journal_src = _need_force_journal(in, false);
7883 if (in && target && target->is_dir())
7884 force_journal_dest = _need_force_journal(in, true);
7885
7886 version_t srcdnpv = 0;
7887 // repair src
7888 if (srcdn) {
7889 if (srcdn->authority().first == whoami)
7890 srcdnpv = srcdn->pre_dirty();
// Re-link the original inode (or remote linkage) back under srcdn.
7891 if (rollback.orig_src.ino) {
7892 assert(in);
7893 srcdn->push_projected_linkage(in);
7894 } else
7895 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
7896 rollback.orig_src.remote_d_type);
7897 }
7898
// Restore the source inode's ctime; only project it if we are auth.
7899 inode_t *pi = 0;
7900 if (in) {
7901 if (in->authority().first == whoami) {
7902 pi = in->project_inode();
7903 mut->add_projected_inode(in);
7904 pi->version = in->pre_dirty();
7905 } else
7906 pi = in->get_projected_inode();
7907 if (pi->ctime == rollback.ctime)
7908 pi->ctime = rollback.orig_src.old_ctime;
7909 }
7910
7911 if (srcdn && srcdn->authority().first == whoami) {
7912 nest_info_t blah;
7913 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
7914 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
7915 }
7916
7917 // repair dest
7918 if (destdn) {
7919 if (rollback.orig_dest.ino && target) {
7920 destdn->push_projected_linkage(target);
7921 } else if (rollback.orig_dest.remote_ino) {
7922 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
7923 rollback.orig_dest.remote_d_type);
7924 } else {
7925 // the dentry will be trimmed soon, it's ok to have wrong linkage
7926 if (rollback.orig_dest.ino)
7927 assert(mds->is_resolve());
7928 destdn->push_projected_linkage();
7929 }
7930 }
7931
7932 if (straydn)
7933 straydn->push_projected_linkage();
7934
// Restore the rename target's ctime and link count.
7935 if (target) {
7936 inode_t *ti = NULL;
7937 if (target->authority().first == whoami) {
7938 ti = target->project_inode();
7939 mut->add_projected_inode(target);
7940 ti->version = target->pre_dirty();
7941 } else
7942 ti = target->get_projected_inode();
7943 if (ti->ctime == rollback.ctime)
7944 ti->ctime = rollback.orig_dest.old_ctime;
7945 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
7946 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
7947 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
7948 else
7949 assert(rollback.orig_dest.remote_ino &&
7950 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
7951 } else
// Re-add the link the rename had removed from the target.
7952 ti->nlink++;
7953 }
7954
// NOTE(review): these four log at level 0 — looks like leftover debug
// output; consider demoting to dout(10) like the rest of this path.
7955 if (srcdn)
7956 dout(0) << " srcdn back to " << *srcdn << dendl;
7957 if (in)
7958 dout(0) << " srci back to " << *in << dendl;
7959 if (destdn)
7960 dout(0) << " destdn back to " << *destdn << dendl;
7961 if (target)
7962 dout(0) << " desti back to " << *target << dendl;
7963
7964 // journal it
7965 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
7966 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
7967 mdlog->start_entry(le);
7968
7969 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
7970 le->commit.add_dir_context(srcdir);
7971 if (rollback.orig_src.ino)
7972 le->commit.add_primary_dentry(srcdn, 0, true);
7973 else
7974 le->commit.add_remote_dentry(srcdn, true);
7975 }
7976
7977 if (!rollback.orig_src.ino && // remote linkage
7978 in && in->authority().first == whoami) {
7979 le->commit.add_dir_context(in->get_projected_parent_dir());
7980 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
7981 }
7982
7983 if (force_journal_dest) {
7984 assert(rollback.orig_dest.ino);
7985 le->commit.add_dir_context(destdir);
7986 le->commit.add_primary_dentry(destdn, 0, true);
7987 }
7988
7989 // slave: no need to journal straydn
7990
7991 if (target && target != in && target->authority().first == whoami) {
7992 assert(rollback.orig_dest.remote_ino);
7993 le->commit.add_dir_context(target->get_projected_parent_dir());
7994 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
7995 }
7996
7997 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
7998 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
7999 le->commit.renamed_dirino = in->ino();
8000 if (srcdn->authority().first == whoami) {
8001 list<CDir*> ls;
8002 in->get_dirfrags(ls);
8003 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8004 CDir *dir = *p;
8005 if (!dir->is_auth())
8006 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8007 }
8008 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8009 }
8010 } else if (force_journal_dest) {
8011 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8012 le->commit.renamed_dirino = target->ino();
8013 }
8014
// Stage the subtree moves; they are applied in _rename_rollback_finish().
8015 if (target && target->is_dir()) {
8016 assert(destdn);
8017 mdcache->project_subtree_rename(target, straydir, destdir);
8018 }
8019
8020 if (in && in->is_dir()) {
8021 assert(srcdn);
8022 mdcache->project_subtree_rename(in, destdir, srcdir);
8023 }
8024
// If the prep was never journaled there must be nothing to journal now;
// finish synchronously.  Otherwise log the rollback event first.
8025 if (mdr && !mdr->more()->slave_update_journaled) {
8026 assert(le->commit.empty());
8027 mdlog->cancel_entry(le);
8028 mut->ls = NULL;
8029 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8030 } else {
8031 assert(!le->commit.empty());
8032 if (mdr)
8033 mdr->more()->slave_update_journaled = false;
8034 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8035 destdn, straydn, finish_mdr);
8036 submit_mdlog_entry(le, fin, mdr, __func__);
8037 mdlog->flush();
8038 }
8039 }
8040
/*
 * Apply the projected rollback state built in do_rename_rollback():
 * pop the projected linkages, re-dirty srcdn if we are its auth, fix up
 * the subtree map for any renamed directories, then release the
 * request/mutation state.  Invoked either synchronously (nothing was
 * journaled) or from C_MDS_LoggedRenameRollback after the OP_ROLLBACK
 * event is durable.  mdr may be null during resolve.
 */
8041 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
8042 version_t srcdnpv, CDentry *destdn,
8043 CDentry *straydn, bool finish_mdr)
8044 {
8045 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
8046
// Unlink the post-rename linkages, then pop the projected (original)
// ones pushed during do_rename_rollback().
8047 if (straydn) {
8048 straydn->get_dir()->unlink_inode(straydn);
8049 straydn->pop_projected_linkage();
8050 }
8051 if (destdn) {
8052 destdn->get_dir()->unlink_inode(destdn);
8053 destdn->pop_projected_linkage();
8054 }
8055 if (srcdn) {
8056 srcdn->pop_projected_linkage();
8057 if (srcdn->authority().first == mds->get_nodeid())
8058 srcdn->mark_dirty(srcdnpv, mut->ls);
8059 }
8060
8061 mut->apply();
8062
8063 if (srcdn && srcdn->get_linkage()->is_primary()) {
8064 CInode *in = srcdn->get_linkage()->get_inode();
8065 if (srcdn->authority().first == mds->get_nodeid())
8066 in->state_set(CInode::STATE_AUTH);
8067 // update subtree map?
8068 if (in && in->is_dir()) {
8069 assert(destdn);
8070 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
8071 }
8072 }
8073
8074 if (destdn) {
8075 CInode *oldin = destdn->get_linkage()->get_inode();
8076 // update subtree map?
8077 if (oldin && oldin->is_dir()) {
8078 assert(straydn);
8079 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
8080 }
8081 }
8082
// During resolve, the rolled-back subtree may be entirely non-auth now;
// try to trim it away.
8083 if (mds->is_resolve()) {
8084 CDir *root = NULL;
8085 if (straydn)
8086 root = mdcache->get_subtree_root(straydn->get_dir());
8087 else if (destdn)
8088 root = mdcache->get_subtree_root(destdn->get_dir());
8089 if (root)
8090 mdcache->try_trim_non_auth_subtree(root);
8091 }
8092
8093 if (mdr) {
8094 list<MDSInternalContextBase*> finished;
8095 if (mdr->more()->is_ambiguous_auth) {
8096 if (srcdn->is_auth())
8097 mdr->more()->rename_inode->unfreeze_inode(finished);
8098
8099 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8100 mdr->more()->is_ambiguous_auth = false;
8101 }
8102 mds->queue_waiters(finished);
8103 if (finish_mdr || mdr->aborted)
8104 mdcache->request_finish(mdr);
8105 else
8106 mdr->more()->slave_rolling_back = false;
8107 }
8108
8109 mdcache->finish_rollback(mut->reqid);
8110
8111 mut->cleanup();
8112 }
8113
8114 /* This function DOES put the passed message before returning*/
/*
 * Master side: process a slave's OP_RENAMEPREPACK.  Either the slave
 * witnessed the prep (possibly attaching an inode export blob for srci),
 * or it replied with an expanded witness list we must satisfy first.
 * Re-dispatches the client request once all pending slaves have acked.
 */
8115 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8116 {
8117 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8118 << " witnessed by " << ack->get_source()
8119 << " " << *ack << dendl;
8120 mds_rank_t from = mds_rank_t(ack->get_source().num());
8121
8122 // note slave
8123 mdr->more()->slaves.insert(from);
// The srcdn auth froze/authpinned the inode remotely; mark auth as
// ambiguous here until the rename is resolved.
8124 if (mdr->more()->srcdn_auth_mds == from &&
8125 mdr->more()->is_remote_frozen_authpin &&
8126 !mdr->more()->is_ambiguous_auth) {
8127 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
8128 }
8129
8130 // witnessed? or add extra witnesses?
8131 assert(mdr->more()->witnessed.count(from) == 0);
8132 if (ack->witnesses.empty()) {
8133 mdr->more()->witnessed.insert(from);
8134 if (!ack->is_not_journaled())
8135 mdr->more()->has_journaled_slaves = true;
8136 } else {
// The slave found srcdn replicas we did not include; remember them so
// the next dispatch can recruit them as witnesses.
8137 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
8138 mdr->more()->extra_witnesses.swap(ack->witnesses);
8139 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
8140 }
8141
8142 // srci import?
8143 if (ack->inode_export.length()) {
8144 dout(10) << " got srci import" << dendl;
8145 mdr->more()->inode_import.claim(ack->inode_export);
8146 mdr->more()->inode_import_v = ack->inode_export_v;
8147 }
8148
8149 // remove from waiting list
8150 assert(mdr->more()->waiting_on_slave.count(from));
8151 mdr->more()->waiting_on_slave.erase(from);
8152
8153 if (mdr->more()->waiting_on_slave.empty())
8154 dispatch_client_request(mdr); // go again!
8155 else
8156 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
8157 }
8158
8159 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8160 {
8161 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8162 << ack->get_source() << dendl;
8163 assert(mdr->is_slave());
8164 mds_rank_t from = mds_rank_t(ack->get_source().num());
8165
8166 if (mdr->more()->waiting_on_slave.count(from)) {
8167 mdr->more()->waiting_on_slave.erase(from);
8168
8169 if (mdr->more()->waiting_on_slave.empty()) {
8170 if (mdr->slave_request)
8171 dispatch_slave_request(mdr);
8172 } else
8173 dout(10) << " still waiting for rename notify acks from "
8174 << mdr->more()->waiting_on_slave << dendl;
8175 }
8176 }
8177
8178 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8179 {
8180 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8181
8182 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8183 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8184
8185 if (mdr->more()->waiting_on_slave.empty()) {
8186 if (mdr->slave_request)
8187 dispatch_slave_request(mdr);
8188 } else
8189 dout(10) << " still waiting for rename notify acks from "
8190 << mdr->more()->waiting_on_slave << dendl;
8191 }
8192 }
8193
8194 // snaps
8195 /* This function takes responsibility for the passed mdr*/
/*
 * Handle a client LSSNAP: list the snapshots visible on a directory,
 * encoded as a readdir-style paged reply (dirstat, entry count, flags,
 * then per-entry name + lease + inodestat).  Pagination resumes after
 * the snap named in the request's path2.
 */
8196 void Server::handle_client_lssnap(MDRequestRef& mdr)
8197 {
8198 MClientRequest *req = mdr->client_request;
8199
8200 // traverse to path
8201 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8202 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8203 respond_to_request(mdr, -ESTALE);
8204 return;
8205 }
8206 if (!diri->is_auth()) {
8207 mdcache->request_forward(mdr, diri->authority().first);
8208 return;
8209 }
8210 if (!diri->is_dir()) {
8211 respond_to_request(mdr, -ENOTDIR);
8212 return;
8213 }
8214 dout(10) << "lssnap on " << *diri << dendl;
8215
8216 // lock snap
8217 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8218 mds->locker->include_snap_rdlocks(rdlocks, diri);
8219 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8220 return;
8221
8222 if (!check_access(mdr, diri, MAY_READ))
8223 return;
8224
8225 SnapRealm *realm = diri->find_snaprealm();
8226 map<snapid_t,SnapInfo*> infomap;
8227 realm->get_snap_info(infomap, diri->get_oldest_snap());
8228
8229 unsigned max_entries = req->head.args.readdir.max_entries;
8230 if (!max_entries)
8231 max_entries = infomap.size();
8232 int max_bytes = req->head.args.readdir.max_bytes;
8233 if (!max_bytes)
8234 // make sure at least one item can be encoded
8235 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
8236
// path2 carries the last snap name returned in the previous page;
// resume listing just after it.
8237 __u64 last_snapid = 0;
8238 string offset_str = req->get_path2();
8239 if (!offset_str.empty())
8240 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
8241
8242 bufferlist dirbl;
8243 encode_empty_dirstat(dirbl);
8244
// Reserve space for the trailing count (__u32) and flag bytes.
8245 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
8246
8247 __u32 num = 0;
8248 bufferlist dnbl;
8249 map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
8250 for (; p != infomap.end() && num < max_entries; ++p) {
8251 dout(10) << p->first << " -> " << *p->second << dendl;
8252
8253 // actual
// Snaps inherited from an ancestor realm get their long (qualified)
// name; the directory's own snaps keep the plain name.
8254 string snap_name;
8255 if (p->second->ino == diri->ino())
8256 snap_name = p->second->name;
8257 else
8258 snap_name = p->second->get_long_name();
8259
8260 unsigned start_len = dnbl.length();
8261 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
8262 break;
8263
8264 ::encode(snap_name, dnbl);
8265 encode_infinite_lease(dnbl);
8266
// If the inodestat does not fit, roll the entry back and stop here.
8267 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
8268 if (r < 0) {
8269 bufferlist keep;
8270 keep.substr_of(dnbl, 0, start_len);
8271 dnbl.swap(keep);
8272 break;
8273 }
8274 ++num;
8275 }
8276
8277 ::encode(num, dirbl);
8278 __u16 flags = 0;
8279 if (p == infomap.end()) {
8280 flags = CEPH_READDIR_FRAG_END;
8281 if (last_snapid == 0)
8282 flags |= CEPH_READDIR_FRAG_COMPLETE;
8283 }
8284 ::encode(flags, dirbl);
8285 dirbl.claim_append(dnbl);
8286
8287 mdr->reply_extra_bl = dirbl;
8288 mdr->tracei = diri;
8289 respond_to_request(mdr, 0);
8290 }
8291
8292
8293 // MKSNAP
8294
8295 struct C_MDS_mksnap_finish : public ServerLogContext {
8296 CInode *diri;
8297 SnapInfo info;
8298 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8299 ServerLogContext(s, r), diri(di), info(i) {}
8300 void finish(int r) override {
8301 server->_mksnap_finish(mdr, diri, info);
8302 }
8303 };
8304
8305 /* This function takes responsibility for the passed mdr*/
/*
 * Handle a client MKSNAP in two phases.  First pass: validate the
 * request, take the snaplock, and ask the snaptable client to prepare a
 * new snapid (the request is re-dispatched via C_MDS_RetryRequest once
 * the stid/snapid arrive).  Second pass (stid present): project the
 * inode + snaprealm changes and journal them; _mksnap_finish() applies
 * and replies.
 */
8306 void Server::handle_client_mksnap(MDRequestRef& mdr)
8307 {
8308 if (!mds->mdsmap->allows_snaps()) {
8309 // you can't make snapshots until you set an option right now
8310 respond_to_request(mdr, -EPERM);
8311 return;
8312 }
8313
8314 MClientRequest *req = mdr->client_request;
8315 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8316 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8317 respond_to_request(mdr, -ESTALE);
8318 return;
8319 }
8320
8321 if (!diri->is_auth()) { // fw to auth?
8322 mdcache->request_forward(mdr, diri->authority().first);
8323 return;
8324 }
8325
8326 // dir only
8327 if (!diri->is_dir()) {
8328 respond_to_request(mdr, -ENOTDIR);
8329 return;
8330 }
8331 if (diri->is_system() && !diri->is_root()) {
8332 // no snaps in system dirs (root is ok)
8333 respond_to_request(mdr, -EPERM);
8334 return;
8335 }
8336
8337 const string &snapname = req->get_filepath().last_dentry();
8338
// Snap creation is restricted to the configured uid range.
8339 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8340 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8341 respond_to_request(mdr, -EPERM);
8342 return;
8343 }
8344
8345 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
8346
8347 // lock snap
8348 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8349
// We need an xlock (not just an rdlock) on this dir's own snaplock.
8350 mds->locker->include_snap_rdlocks(rdlocks, diri);
8351 rdlocks.erase(&diri->snaplock);
8352 xlocks.insert(&diri->snaplock);
8353
8354 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8355 return;
8356
8357 if (!check_access(mdr, diri, MAY_WRITE))
8358 return;
8359
8360 // make sure name is unique
8361 if (diri->snaprealm &&
8362 diri->snaprealm->exists(snapname)) {
8363 respond_to_request(mdr, -EEXIST);
8364 return;
8365 }
// Leading '_' is rejected — presumably reserved for the long names of
// ancestor-realm snaps (cf. get_long_name() in lssnap); confirm.
8366 if (snapname.length() == 0 ||
8367 snapname[0] == '_') {
8368 respond_to_request(mdr, -EINVAL);
8369 return;
8370 }
8371
8372 // allocate a snapid
8373 if (!mdr->more()->stid) {
8374 // prepare an stid
8375 mds->snapclient->prepare_create(diri->ino(), snapname,
8376 mdr->get_mds_stamp(),
8377 &mdr->more()->stid, &mdr->more()->snapidbl,
8378 new C_MDS_RetryRequest(mdcache, mdr));
8379 return;
8380 }
8381
8382 version_t stid = mdr->more()->stid;
8383 snapid_t snapid;
8384 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8385 ::decode(snapid, p);
8386 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
8387
8388 // journal
8389 SnapInfo info;
8390 info.ino = diri->ino();
8391 info.snapid = snapid;
8392 info.name = snapname;
8393 info.stamp = mdr->get_op_stamp();
8394
8395 inode_t *pi = diri->project_inode();
8396 pi->ctime = info.stamp;
8397 pi->version = diri->pre_dirty();
8398
8399 // project the snaprealm
8400 sr_t *newsnap = diri->project_snaprealm(snapid);
8401 newsnap->snaps[snapid] = info;
8402 newsnap->seq = snapid;
8403 newsnap->last_created = snapid;
8404
8405 // journal the inode changes
8406 mdr->ls = mdlog->get_current_segment();
8407 EUpdate *le = new EUpdate(mdlog, "mksnap");
8408 mdlog->start_entry(le);
8409
8410 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8411 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8412 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8413 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8414
8415 // journal the snaprealm changes
8416 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
8417 mdr, __func__);
8418 mdlog->flush();
8419 }
8420
8421 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
8422 {
8423 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
8424
8425 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
8426
8427 diri->pop_and_dirty_projected_inode(mdr->ls);
8428 mdr->apply();
8429
8430 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8431
8432 // create snap
8433 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8434
8435 mdcache->do_realm_invalidate_and_update_notify(diri, op);
8436
8437 // yay
8438 mdr->in[0] = diri;
8439 mdr->snapid = info.snapid;
8440 mdr->tracei = diri;
8441 respond_to_request(mdr, 0);
8442 }
8443
8444
8445 // RMSNAP
8446
8447 struct C_MDS_rmsnap_finish : public ServerLogContext {
8448 CInode *diri;
8449 snapid_t snapid;
8450 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8451 ServerLogContext(s, r), diri(di), snapid(sn) {}
8452 void finish(int r) override {
8453 server->_rmsnap_finish(mdr, diri, snapid);
8454 }
8455 };
8456
/* This function takes responsibility for the passed mdr */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  // the snapshotted directory is addressed by ino in the request path
  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // the snapshot name travels as the final path component
  const string &snapname = req->get_filepath().last_dentry();

  // snapshot management is restricted to uids within
  // [mds_snap_min_uid, mds_snap_max_uid]
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?  '_'-prefixed names refer to snaps inherited from a
  // parent realm, which cannot be removed through this dir.
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  // take an exclusive lock on the dir's own snaplock (rdlocks on the rest
  // of the snap lock set) before mutating the snaprealm
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // prepare: phase 1 of the snap-table transaction.  The request is
  // re-dispatched (C_MDS_RetryRequest) once the snap server has filled
  // in stid/snapidbl, and then skips this branch.
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
				     &mdr->more()->stid, &mdr->more()->snapidbl,
				     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;   // new snaprealm sequence number issued by the snap server
  ::decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // journal: project the inode update...
  inode_t *pi = diri->project_inode();
  pi->version = diri->pre_dirty();
  pi->ctime = mdr->get_op_stamp();

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // ...and project the snaprealm: drop the snap, bump seq/last_destroyed
  sr_t *newnode = diri->project_snaprealm();
  newnode->snaps.erase(snapid);
  newnode->seq = seq;
  newnode->last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // _rmsnap_finish applies the projection and commits the table
  // transaction once the journal entry is durable
  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
8546
8547 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8548 {
8549 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
8550 snapid_t stid = mdr->more()->stid;
8551 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8552 snapid_t seq;
8553 ::decode(seq, p);
8554
8555 diri->pop_and_dirty_projected_inode(mdr->ls);
8556 mdr->apply();
8557
8558 mds->snapclient->commit(stid, mdr->ls);
8559
8560 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8561
8562 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8563
8564 // yay
8565 mdr->in[0] = diri;
8566 respond_to_request(mdr, 0);
8567
8568 // purge snapshot data
8569 if (diri->snaprealm->have_past_parents_open())
8570 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8571 }
8572
8573 struct C_MDS_renamesnap_finish : public ServerLogContext {
8574 CInode *diri;
8575 snapid_t snapid;
8576 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8577 ServerLogContext(s, r), diri(di), snapid(sn) {}
8578 void finish(int r) override {
8579 server->_renamesnap_finish(mdr, diri, snapid);
8580 }
8581 };
8582
/* This function takes responsibility for the passed mdr */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  // src (filepath2) and dst (filepath) must name snaps of the same directory
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }

  if (!diri->is_dir()) {   // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // snapshot management is restricted to uids within
  // [mds_snap_min_uid, mds_snap_max_uid]
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  // old and new snap names travel as the final components of the two paths
  const string &dstname = req->get_filepath().last_dentry();
  const string &srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  // '_'-prefixed names refer to snaps inherited from a parent realm;
  // those cannot be renamed through this dir
  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap: exclusive on the dir's own snaplock, rdlocks on the rest
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // prepare: phase 1 of the snap-table transaction.  The request is
  // re-dispatched (C_MDS_RetryRequest) once the snap server has filled
  // in stid/snapidbl, and then skips this branch.
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
				    &mdr->more()->stid, &mdr->more()->snapidbl,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;   // new snaprealm sequence number issued by the snap server
  ::decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // journal: project the inode update...
  inode_t *pi = diri->project_inode();
  pi->ctime = mdr->get_op_stamp();
  pi->version = diri->pre_dirty();

  // ...and project the snaprealm with the snap renamed in place
  sr_t *newsnap = diri->project_snaprealm();
  assert(newsnap->snaps.count(snapid));
  newsnap->snaps[snapid].name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes; _renamesnap_finish applies and commits
  // once the journal entry is durable
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
		     mdr, __func__);
  mdlog->flush();
}
8690
// Journal-commit callback for renamesnap: applies the projected changes,
// commits the snap-table transaction, and replies to the client.
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  // make the projected inode/snaprealm (with the renamed snap) live
  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  // phase 2 of the snap-table transaction started in handle_client_renamesnap
  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // tell clients the realm changed
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);

  // reply, tracing the dir inode at the renamed snapid
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}
8710
8711 /**
8712 * Return true if server is in state RECONNECT and this
8713 * client has not yet reconnected.
8714 */
8715 bool Server::waiting_for_reconnect(client_t c) const
8716 {
8717 return client_reconnect_gather.count(c) > 0;
8718 }
8719
// Write the set of clients we are still gathering reconnects from into
// the given formatter, as a "reconnect_status" object.
void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}