// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/assert.h" // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "messages/MClientSession.h"
#include "messages/MClientRequest.h"
#include "messages/MClientReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MLock.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <iostream>
#include <boost/utility/string_view.hpp>
using namespace std;

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSInternalContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
                      "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
  plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
                      "Request type lookup hash of inode");
  plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
                      "Request type lookup inode");
  plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
                      "Request type lookup parent");
  plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
                      "Request type lookup name");
  plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
                      "Request type lookup");
  plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
                      "Request type lookup snapshot");
  plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
                      "Request type get attribute");
  plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
                      "Request type set attribute");
  plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
                      "Request type set file layout");
  plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
                      "Request type set directory layout");
  plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
                      "Request type set extended attribute");
  plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
                      "Request type remove extended attribute");
  plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
                      "Request type read directory");
  plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
                      "Request type set file lock");
  plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
                      "Request type get file lock");
  plb.add_u64_counter(l_mdss_req_create, "req_create",
                      "Request type create");
  plb.add_u64_counter(l_mdss_req_open, "req_open",
                      "Request type open");
  plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
                      "Request type make node");
  plb.add_u64_counter(l_mdss_req_link, "req_link",
                      "Request type link");
  plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
                      "Request type unlink");
  plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
                      "Request type remove directory");
  plb.add_u64_counter(l_mdss_req_rename, "req_rename",
                      "Request type rename");
  plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
                      "Request type make directory");
  plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
                      "Request type symbolic link");
  plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
                      "Request type list snapshot");
  plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
                      "Request type make snapshot");
  plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
                      "Request type remove snapshot");
  plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
                      "Request type rename snapshot");
  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false)
{
}


/* This function DOES put the passed message before returning*/
void Server::dispatch(Message *m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(static_cast<MClientReconnect*>(m));
    return;
  }

  // active?
  // handle_slave_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    MClientRequest *req = static_cast<MClientRequest*>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        req->put();
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
      } else if (req->get_retry_attempt()) {
        // Process a completed request in the clientreplay stage. The completed
        // request may have created a new file/directory. This guarantees the
        // MDS sends a reply to the client before another request modifies the
        // new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(static_cast<MClientRequest*>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    assert(0 == "server unknown message");
  }
}



// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

/* This function DOES put the passed message before returning*/
void Server::handle_client_session(MClientSession *m)
{
  version_t pv;
  bool blacklisted = false;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    m->put();
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      // set client metadata for session opened by prepare_force_open_sessions
      if (!m->client_meta.empty())
        session->set_client_metadata(m->client_meta);
      m->put();
      return;
    }
    assert(session->is_closed() ||
           session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      m->put();
      return;
    }

    blacklisted = mds->objecter->with_osdmap(
        [session](const OSDMap &osd_map) -> bool {
          return osd_map.is_blacklisted(session->info.inst.addr);
        });

    if (blacklisted) {
      dout(10) << "rejecting blacklisted client " << session->info.inst.addr << dendl;
      mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
      m->put();
      return;
    }

    session->set_client_metadata(m->client_meta);
    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
             << session->info.client_metadata.size() << " metadata entries:" << dendl;
    for (map<string, string>::iterator i = session->info.client_metadata.begin();
         i != session->info.client_metadata.end(); ++i) {
      dout(20) << " " << i->first << ": " << i->second << dendl;
    }

    // Special case for the 'root' metadata path; validate that the claimed
    // root is actually within the caps of the session
    if (session->info.client_metadata.count("root")) {
      const auto claimed_root = session->info.client_metadata.at("root");
      // claimed_root has a leading "/" which we strip before passing
      // into caps check
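      // e.g. a client that mounted "/home/foo" must hold an MDS cap
      // covering the path "home/foo"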
      if (claimed_root.empty() || claimed_root[0] != '/' ||
          !session->auth_caps.path_capable(claimed_root.substr(1))) {
        derr << __func__ << " forbidden path claimed as mount root: "
             << claimed_root << " by " << m->get_source() << dendl;
        // Tell the client we're rejecting their open
        mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
        mds->clog->warn() << "client session with invalid root '" <<
          claimed_root << "' denied (" << session->info.inst << ")";
        session->clear();
        // Drop out; don't record this session in SessionMap or journal it.
        break;
      }
    }

    if (session->is_closed())
      mds->sessionmap.add_session(session);

    pv = mds->sessionmap.mark_projected(session);
    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    mds->sessionmap.touch_session(session);
    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
                              new C_MDS_session_finish(this, session, sseq, true, pv));
    mdlog->flush();
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() ||
        session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        m->put();
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        m->put();
        return;
      }
      assert(session->is_open() ||
             session->is_stale() ||
             session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        m->put();
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle it the same as any other sequence-number error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        m->put();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
  m->put();
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    assert(session);
    if (!session->is_open() ||
        !session->connection.get() ||
        !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
      continue;
    version_t seq = session->wait_for_flush(gather.new_sub());
    mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  list<MDSInternalContextBase*> finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    assert(session->is_closing() || session->is_killing() ||
           session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    assert(session->connection != NULL);
    session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
    if (mdcache->is_readonly())
      session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, session->info.inst.name.num());
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.count(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      client_reconnect_gather.erase(session->info.get_client());

      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->connection != NULL) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->connection->mark_disposable();
      }

      // reset session
      mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->connection != NULL) {
        session->connection->mark_down();
        session->connection->set_priv(NULL);
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
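/*
 * Illustrative sketch of the usual prepare/finish pairing by an importing
 * MDS (hypothetical caller; the journal step in between is elided):
 *
 *   map<client_t, entity_inst_t> client_map;               // from the peer
 *   map<client_t, pair<Session*, uint64_t> > smap;
 *   version_t pv = server->prepare_force_open_sessions(client_map, smap);
 *   // ... journal the import event that carries pv ...
 *   server->finish_force_open_sessions(smap, true);        // once journaled
 */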
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blacklisted(p->second.addr)) {
            dout(10) << " ignoring blacklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    } else {
      assert(session->is_open() ||
             session->is_opening() ||
             session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
        if (mdcache->is_readonly())
          mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
  public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(2) << "terminate_sessions" << dendl;

  terminating_sessions = true;

  // kill them off. clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}


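/*
 * Walk sessions from least to most recently renewed: mark sessions quiet
 * past mds_session_timeout STALE (revoking caps and leases), then evict
 * stale sessions that have also exceeded mds_session_autoclose, unless this
 * MDS has itself been laggy or is serving a lone client.
 */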
void Server::find_idle_sessions()
{
  dout(10) << "find_idle_sessions. laggy until " << mds->get_laggy_until() << dendl;

  // timeout/stale
  // (caps go stale, leases die)
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf->mds_session_timeout;
  while (1) {
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
    if (!session) break;
    dout(20) << "laggiest active session is " << session->info.inst << dendl;
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
    mds->sessionmap.set_state(session, Session::STATE_STALE);
    mds->locker->revoke_stale_caps(session);
    mds->locker->remove_stale_leases(session);
    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
    finish_flush_session(session, session->get_push_seq());
  }

  // autoclose
  cutoff = now;
  cutoff -= g_conf->mds_session_autoclose;

  // don't kick clients if we've been laggy
  if (mds->get_laggy_until() > cutoff) {
    dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
             << ", not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 &&
      mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "not evicting a slow client, because there is only one"
             << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  std::vector<Session *> to_evict;
  const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
    return;
  }
  const auto &stale_sessions = sessions_p->second;
  assert(stale_sessions != nullptr);

  for (const auto &session: *stale_sessions) {
    if (session->is_importing()) {
      dout(10) << "stopping at importing session " << session->info.inst << dendl;
      break;
    }
    assert(session->is_stale());
    if (session->last_cap_renew >= cutoff) {
      dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
               << session->last_cap_renew << ")" << dendl;
      break;
    }

    to_evict.push_back(session);
  }

  for (const auto &session: to_evict) {
    utime_t age = now;
    age -= session->last_cap_renew;
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << age << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst << " last "
             << session->last_cap_renew << dendl;

    if (g_conf->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->info.inst.name.num(), false, true,
                        ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

/*
 * XXX bump in the interface here, not using an MDSInternalContextBase here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    assert(session->is_closing() ||
           session->is_closed() ||
           session->is_killing() ||
           session->is_importing());
    if (on_safe) {
      on_safe->complete(0);
    }
  }
}

size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  std::list<Session*> victims;
  const auto sessions = mds->sessionmap.get_sessions();
  for (const auto p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    if (blacklist.count(s->info.inst.addr)) {
      victims.push_back(s);
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

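/*
 * Move the session into the given CLOSING/KILLING state, journal an ESession
 * close event releasing its preallocated and pending-preallocated inos, and
 * kill any requests still attached to the session.
 */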
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open())
      client_reconnect_gather.insert(session->get_client());
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = ceph_clock_now();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

/* This function DOES put the passed message before returning*/
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  assert(session);

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (!session->is_open()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = ceph_clock_now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
        dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
        mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
        dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
               << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_export_caps(p->first, from, p->second,
                                  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }
  mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}



void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  assert(reconnect_done);
  reconnect_done->complete(0);
  reconnect_done = NULL;
}

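/*
 * Called periodically while in reconnect: once mds_reconnect_timeout has
 * elapsed, give up on the clients that have not reconnected, evicting (and
 * optionally blacklisting) them before finishing the reconnect phase.
 */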
void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  utime_t reconnect_end = reconnect_start;
  reconnect_end += g_conf->mds_reconnect_timeout;
  if (ceph_clock_now() >= reconnect_end &&
      !client_reconnect_gather.empty()) {
    dout(10) << "reconnect timed out" << dendl;

    // If we're doing blacklist evictions, use this to wait for them before
    // proceeding to reconnect_gather_finish
    MDSGatherBuilder gather(g_ceph_context);

    for (set<client_t>::iterator p = client_reconnect_gather.begin();
         p != client_reconnect_gather.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
      assert(session);
      dout(1) << "reconnect gave up on " << session->info.inst << dendl;

      mds->clog->warn() << "evicting unresponsive client " << *session
                        << ", after waiting " << g_conf->mds_reconnect_timeout
                        << " seconds during MDS startup";

      if (g_conf->mds_session_blacklist_on_timeout) {
        std::stringstream ss;
        mds->evict_client(session->info.inst.name.num(), false, true, ss,
                          gather.new_sub());
      } else {
        kill_session(session, NULL);
      }

      failed_reconnects++;
    }
    client_reconnect_gather.clear();

    if (gather.has_subs()) {
      dout(1) << "reconnect will complete once clients are evicted" << dendl;
      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
            [this](int r){reconnect_gather_finish();})));
      gather.activate();
      reconnect_evicting = true;
    } else {
      reconnect_gather_finish();
    }
  }
}

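/*
 * Re-install file locks sent by a reconnecting client. The bufferlist holds
 * a count of fcntl (POSIX) locks followed by that many ceph_filelock
 * records, then a count of flock locks followed by their records.
 */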
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  bufferlist::iterator p = locks.begin();
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}


/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
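/*
 * Worked example (assuming the defaults mds_max_ratio_caps_per_client = 0.8
 * and mds_min_caps_per_client = 100): with the cache 50% over its
 * reservation, ratio = 1.0 - fmin(0.80, 0.5) = 0.5, so a session holding
 * 20000 caps is asked to drop to MAX(MIN(20000 * 0.5, max_caps_per_client),
 * 100) = 10000 caps, provided the global cap count is large enough.
 */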
void Server::recall_client_state(void)
{
  /* try to recall at least 80% of all caps */
  uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
  uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
  if (max_caps_per_client < min_caps_per_client) {
    dout(0) << "max_caps_per_client " << max_caps_per_client
            << " < min_caps_per_client " << min_caps_per_client << dendl;
    max_caps_per_client = min_caps_per_client + 1;
  }

  /* unless this ratio is smaller: */
  /* ratio: determine the amount of caps to recall from each client. Use
   * percentage full over the cache reservation. Cap the ratio at 80% of client
   * caps. */
  double ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());

  dout(10) << "recall_client_state " << ratio
           << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
           << dendl;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto &session : sessions) {
    if (!session->is_open() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << " session " << session->info.inst
             << " caps " << session->caps.size()
             << ", leases " << session->leases.size()
             << dendl;

    uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
    if (session->caps.size() > newlim) {
      MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
      m->head.max_caps = newlim;
      mds->send_message_client(m, session);
      session->notify_recall_sent(newlim);
    }
  }
}

void Server::force_clients_readonly()
{
  dout(10) << "force_clients_readonly" << dendl;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (!session->info.inst.name.is_client() ||
        !(session->is_open() || session->is_stale()))
      continue;
    mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
  }
}

/*******
 * some generic stuff for finishing off requests
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
  else
    mdlog->flush();
}

void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                const char *event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event_string(event_str);
  }
  mdlog->submit_entry(le, fin);
}

/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));

    // add here to avoid counting ops multiple times (e.g., locks, loading)
    switch(mdr->client_request->get_op()) {
    case CEPH_MDS_OP_LOOKUPHASH:
      logger->inc(l_mdss_req_lookuphash);
      break;
    case CEPH_MDS_OP_LOOKUPINO:
      logger->inc(l_mdss_req_lookupino);
      break;
    case CEPH_MDS_OP_LOOKUPPARENT:
      logger->inc(l_mdss_req_lookupparent);
      break;
    case CEPH_MDS_OP_LOOKUPNAME:
      logger->inc(l_mdss_req_lookupname);
      break;
    case CEPH_MDS_OP_LOOKUP:
      logger->inc(l_mdss_req_lookup);
      break;
    case CEPH_MDS_OP_LOOKUPSNAP:
      logger->inc(l_mdss_req_lookupsnap);
      break;
    case CEPH_MDS_OP_GETATTR:
      logger->inc(l_mdss_req_getattr);
      break;
    case CEPH_MDS_OP_SETATTR:
      logger->inc(l_mdss_req_setattr);
      break;
    case CEPH_MDS_OP_SETLAYOUT:
      logger->inc(l_mdss_req_setlayout);
      break;
    case CEPH_MDS_OP_SETDIRLAYOUT:
      logger->inc(l_mdss_req_setdirlayout);
      break;
    case CEPH_MDS_OP_SETXATTR:
      logger->inc(l_mdss_req_setxattr);
      break;
    case CEPH_MDS_OP_RMXATTR:
      logger->inc(l_mdss_req_rmxattr);
      break;
    case CEPH_MDS_OP_READDIR:
      logger->inc(l_mdss_req_readdir);
      break;
    case CEPH_MDS_OP_SETFILELOCK:
      logger->inc(l_mdss_req_setfilelock);
      break;
    case CEPH_MDS_OP_GETFILELOCK:
      logger->inc(l_mdss_req_getfilelock);
      break;
    case CEPH_MDS_OP_CREATE:
      logger->inc(l_mdss_req_create);
      break;
    case CEPH_MDS_OP_OPEN:
      logger->inc(l_mdss_req_open);
      break;
    case CEPH_MDS_OP_MKNOD:
      logger->inc(l_mdss_req_mknod);
      break;
    case CEPH_MDS_OP_LINK:
      logger->inc(l_mdss_req_link);
      break;
    case CEPH_MDS_OP_UNLINK:
      logger->inc(l_mdss_req_unlink);
      break;
    case CEPH_MDS_OP_RMDIR:
      logger->inc(l_mdss_req_rmdir);
      break;
    case CEPH_MDS_OP_RENAME:
      logger->inc(l_mdss_req_rename);
      break;
    case CEPH_MDS_OP_MKDIR:
      logger->inc(l_mdss_req_mkdir);
      break;
    case CEPH_MDS_OP_SYMLINK:
      logger->inc(l_mdss_req_symlink);
      break;
    case CEPH_MDS_OP_LSSNAP:
      logger->inc(l_mdss_req_lssnap);
      break;
    case CEPH_MDS_OP_MKSNAP:
      logger->inc(l_mdss_req_mksnap);
      break;
    case CEPH_MDS_OP_RMSNAP:
      logger->inc(l_mdss_req_rmsnap);
      break;
    case CEPH_MDS_OP_RENAMESNAP:
      logger->inc(l_mdss_req_renamesnap);
      break;
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}

void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf->mds_early_reply)
    return;

  if (mdr->no_early_reply) {
    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
    return;
  }

  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  MClientRequest *req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  MClientReply *reply = new MClientReply(req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  // _rename_finish() does not send dentry link/unlink messages to replicas,
  // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
  // that have projected linkages from getting new replicas.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
           << " (" << cpp_strerror(reply->get_result())
           << ") " << *req << dendl;

  if (tracei || tracedn) {
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
                   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  req->get_connection()->send_message(reply);

  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}

/*
 * send given reply
 * include a trace to tracei
 * Clean up mdr
 */
void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
{
  assert(mdr.get());
  MClientRequest *req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
          << " (" << cpp_strerror(reply->get_result())
          << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special, they only modify states in MDS memory.
  // Those states are lost when the MDS fails. If a client re-sends a completed
  // setfilelock request, it means the client did not receive the corresponding
  // setfilelock reply, so the MDS should re-execute the request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  snapid_t snapid = mdr->snapid;
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();
  int dentry_wanted = req->get_dentry_wanted();

  if (!did_early_reply && !is_replay) {

    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    dout(20) << "lat " << lat << dendl;

    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (client_inst.name.is_mds() || !session) {
    reply->put(); // mds doesn't need a reply
    reply = 0;
  } else {
    // send reply.
    if (!did_early_reply && // don't issue leases if we sent an earlier reply already
        (tracei || tracedn)) {
      if (is_replay) {
        if (tracei)
          mdcache->try_reconnect_cap(tracei, session);
      } else {
        // include metadata in reply
        set_trace_dist(session, reply, tracei, tracedn,
                       snapid, dentry_wanted,
                       mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    req->get_connection()->send_message(reply);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
           << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}


void Server::encode_empty_dirstat(bufferlist& bl)
{
  static DirStat empty;
  empty.encode(bl);
}

void Server::encode_infinite_lease(bufferlist& bl)
{
  LeaseStat e;
  e.seq = 0;
  e.mask = -1;
  e.duration_ms = -1;
  ::encode(e, bl);
  dout(20) << "encode_infinite_lease " << e << dendl;
}

void Server::encode_null_lease(bufferlist& bl)
{
  LeaseStat e;
  e.seq = 0;
  e.mask = 0;
  e.duration_ms = 0;
  ::encode(e, bl);
  dout(20) << "encode_null_lease " << e << dendl;
}


/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 */
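/*
 * The resulting trace bufferlist is laid out, in order, as
 *   [parent dir inode stat][dirstat][dentry name][lease]  (when dn is given)
 *   [target inode stat]                                   (when in is given)
 * with the snap realm trace carried separately in reply->snapbl for live
 * (CEPH_NOSNAP) lookups.
 */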
void Server::set_trace_dist(Session *session, MClientReply *reply,
                            CInode *in, CDentry *dn,
                            snapid_t snapid,
                            int dentry_wanted,
                            MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups

  // realm
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    dir->encode_dirstat(bl, whoami);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    ::encode(dn->get_name(), bl);
    if (snapid == CEPH_NOSNAP)
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else
      encode_null_lease(bl);
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}




/***
 * process a client request
 * This function DOES put the passed message before returning
 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = mds->get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if (session->is_closed() ||
               session->is_closing() ||
               session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      if (req->is_queued_for_replay())
        mds->queue_one_replay();
      req->put();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it? hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      // Don't send a traceless reply if the completed request created a
      // new inode. Treat the request as a lookup request instead.
1656 if (req->is_replay() ||
1657 ((created == inodeno_t() || !mds->is_clientreplay()) &&
1658 req->get_op() != CEPH_MDS_OP_OPEN &&
1659 req->get_op() != CEPH_MDS_OP_CREATE)) {
1660 dout(5) << "already completed " << req->get_reqid() << dendl;
1661 MClientReply *reply = new MClientReply(req, 0);
1662 if (created != inodeno_t()) {
1663 bufferlist extra;
1664 ::encode(created, extra);
1665 reply->set_extra_bl(extra);
1666 }
1667 req->get_connection()->send_message(reply);
1668
1669 if (req->is_queued_for_replay())
1670 mds->queue_one_replay();
1671
1672 req->put();
1673 return;
1674 }
1675 if (req->get_op() != CEPH_MDS_OP_OPEN &&
1676 req->get_op() != CEPH_MDS_OP_CREATE) {
1677 dout(10) << " completed request which created new inode " << created
1678 << ", convert it to lookup request" << dendl;
1679 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
1680 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
1681 }
1682 }
1683 }
1684
1685 // trim completed_request list
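// The client advertises the oldest tid for which it has not yet seen a
// reply; every completed request older than that can be forgotten. If a
// client never advances its oldest_client_tid the list keeps growing, so
// we warn at a threshold that doubles with each warning
// (mds_max_completed_requests << num_trim_requests_warnings).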
1686 if (req->get_oldest_client_tid() > 0) {
1687 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
1688 assert(session);
1689 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
1690 // The session's 'completed_requests' was dirtied; mark it to be
1691 // potentially flushed at segment expiry.
1692 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
1693
1694 if (session->get_num_trim_requests_warnings() > 0 &&
1695 session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
1696 session->reset_num_trim_requests_warnings();
1697 } else {
1698 if (session->get_num_completed_requests() >=
1699 (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
1700 session->inc_num_trim_requests_warnings();
1701 stringstream ss;
1702 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
1703 << req->get_oldest_client_tid() << "), "
1704 << session->get_num_completed_requests()
1705 << " completed requests recorded in session\n";
1706 mds->clog->warn() << ss.str();
1707 dout(20) << __func__ << " " << ss.str() << dendl;
1708 }
1709 }
1710 }
1711
1712 // register + dispatch
1713 MDRequestRef mdr = mdcache->request_start(req);
1714 if (!mdr.get())
1715 return;
1716
1717 if (session) {
1718 mdr->session = session;
1719 session->requests.push_back(&mdr->item_session_request);
1720 }
1721
1722 if (has_completed)
1723 mdr->has_completed = true;
1724
1725 // process embedded cap releases?
1726 // (only if NOT replay!)
1727 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
1728 client_t client = req->get_source().num();
1729 for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
1730 p != req->releases.end();
1731 ++p)
1732 mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
1733 req->releases.clear();
1734 }
1735
1736 dispatch_client_request(mdr);
1737 return;
1738 }
1739
1740 void Server::handle_osd_map()
1741 {
1742 /* Note that we check the pool's FULL flag directly rather than
1743 * using osdmap_full_flag(), because we want to know "is the flag set"
1744 * rather than "does the flag apply to us?" */
1745 mds->objecter->with_osdmap([this](const OSDMap& o) {
1746 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
1747 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
1748 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1749 << o.get_epoch() << dendl;
1750 });
1751 }
1752
1753 void Server::dispatch_client_request(MDRequestRef& mdr)
1754 {
1755 // we shouldn't be waiting on anyone.
1756 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1757
1758 if (mdr->killed) {
1759 dout(10) << "request " << *mdr << " was killed" << dendl;
1760 return;
1761 } else if (mdr->aborted) {
1762 mdr->aborted = false;
1763 mdcache->request_kill(mdr);
1764 return;
1765 }
1766
1767 MClientRequest *req = mdr->client_request;
1768
1769 if (logger) logger->inc(l_mdss_dispatch_client_request);
1770
1771 dout(7) << "dispatch_client_request " << *req << dendl;
1772
1773 if (req->may_write()) {
1774 if (mdcache->is_readonly()) {
1775 dout(10) << " read-only FS" << dendl;
1776 respond_to_request(mdr, -EROFS);
1777 return;
1778 }
1779 if (mdr->has_more() && mdr->more()->slave_error) {
1780 dout(10) << " got error from slaves" << dendl;
1781 respond_to_request(mdr, mdr->more()->slave_error);
1782 return;
1783 }
1784 }
1785
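// When the metadata pool is full, reject ops that would consume space
// with ENOSPC. Link/rename are only rejected while no slave request has
// been started yet; once witnesses exist, aborting here could leave a
// multi-MDS operation half-done.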
1786 if (is_full) {
1787 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1788 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1790 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1791 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1792 req->get_op() == CEPH_MDS_OP_CREATE ||
1793 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1794 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1795 ((req->get_op() == CEPH_MDS_OP_LINK ||
1796 req->get_op() == CEPH_MDS_OP_RENAME) &&
1797 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1798 ) {
1799
1800 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1801 respond_to_request(mdr, -ENOSPC);
1802 return;
1803 } else {
1804 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1805 }
1806 }
1807
1808 switch (req->get_op()) {
1809 case CEPH_MDS_OP_LOOKUPHASH:
1810 case CEPH_MDS_OP_LOOKUPINO:
1811 handle_client_lookup_ino(mdr, false, false);
1812 break;
1813 case CEPH_MDS_OP_LOOKUPPARENT:
1814 handle_client_lookup_ino(mdr, true, false);
1815 break;
1816 case CEPH_MDS_OP_LOOKUPNAME:
1817 handle_client_lookup_ino(mdr, false, true);
1818 break;
1819
1820 // inodes ops.
1821 case CEPH_MDS_OP_LOOKUP:
1822 handle_client_getattr(mdr, true);
1823 break;
1824
1825 case CEPH_MDS_OP_LOOKUPSNAP:
1826 // lookupsnap does not reference a CDentry; treat it as a getattr
1827 case CEPH_MDS_OP_GETATTR:
1828 handle_client_getattr(mdr, false);
1829 break;
1830
1831 case CEPH_MDS_OP_SETATTR:
1832 handle_client_setattr(mdr);
1833 break;
1834 case CEPH_MDS_OP_SETLAYOUT:
1835 handle_client_setlayout(mdr);
1836 break;
1837 case CEPH_MDS_OP_SETDIRLAYOUT:
1838 handle_client_setdirlayout(mdr);
1839 break;
1840 case CEPH_MDS_OP_SETXATTR:
1841 handle_client_setxattr(mdr);
1842 break;
1843 case CEPH_MDS_OP_RMXATTR:
1844 handle_client_removexattr(mdr);
1845 break;
1846
1847 case CEPH_MDS_OP_READDIR:
1848 handle_client_readdir(mdr);
1849 break;
1850
1851 case CEPH_MDS_OP_SETFILELOCK:
1852 handle_client_file_setlock(mdr);
1853 break;
1854
1855 case CEPH_MDS_OP_GETFILELOCK:
1856 handle_client_file_readlock(mdr);
1857 break;
1858
1859 // funky.
1860 case CEPH_MDS_OP_CREATE:
1861 if (mdr->has_completed)
1862 handle_client_open(mdr); // already created.. just open
1863 else
1864 handle_client_openc(mdr);
1865 break;
1866
1867 case CEPH_MDS_OP_OPEN:
1868 handle_client_open(mdr);
1869 break;
1870
1871 // namespace.
1872 // no prior locks.
1873 case CEPH_MDS_OP_MKNOD:
1874 handle_client_mknod(mdr);
1875 break;
1876 case CEPH_MDS_OP_LINK:
1877 handle_client_link(mdr);
1878 break;
1879 case CEPH_MDS_OP_UNLINK:
1880 case CEPH_MDS_OP_RMDIR:
1881 handle_client_unlink(mdr);
1882 break;
1883 case CEPH_MDS_OP_RENAME:
1884 handle_client_rename(mdr);
1885 break;
1886 case CEPH_MDS_OP_MKDIR:
1887 handle_client_mkdir(mdr);
1888 break;
1889 case CEPH_MDS_OP_SYMLINK:
1890 handle_client_symlink(mdr);
1891 break;
1892
1893
1894 // snaps
1895 case CEPH_MDS_OP_LSSNAP:
1896 handle_client_lssnap(mdr);
1897 break;
1898 case CEPH_MDS_OP_MKSNAP:
1899 handle_client_mksnap(mdr);
1900 break;
1901 case CEPH_MDS_OP_RMSNAP:
1902 handle_client_rmsnap(mdr);
1903 break;
1904 case CEPH_MDS_OP_RENAMESNAP:
1905 handle_client_renamesnap(mdr);
1906 break;
1907
1908 default:
1909 dout(1) << " unknown client op " << req->get_op() << dendl;
1910 respond_to_request(mdr, -EOPNOTSUPP);
1911 }
1912 }
1913
1914
1915 // ---------------------------------------
1916 // SLAVE REQUESTS
1917
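// In multi-MDS operations, the MDS that received the client request (the
// master) sends MMDSSlaveRequest messages to other ranks (the slaves):
// lock ops (OP_XLOCK/OP_WRLOCK and their unlock counterparts,
// OP_DROPLOCKS), OP_AUTHPIN, the *PREP ops for link/unlink/rmdir/rename,
// and OP_FINISH. Slaves ack each op; the master collects the acks in
// handle_slave_request_reply() below.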
1918 /* This function DOES put the passed message before returning*/
1919 void Server::handle_slave_request(MMDSSlaveRequest *m)
1920 {
1921 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1922 mds_rank_t from = mds_rank_t(m->get_source().num());
1923
1924 if (logger) logger->inc(l_mdss_handle_slave_request);
1925
1926 // reply?
1927 if (m->is_reply())
1928 return handle_slave_request_reply(m);
1929
1930 // The purpose of rename notify is to enforce causal message ordering, i.e. to make
1931 // sure bystanders have received all messages from the rename srcdn's auth MDS.
1932 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1933 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1934 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1935 mds->send_message(reply, m->get_connection());
1936 m->put();
1937 return;
1938 }
1939
1940 CDentry *straydn = NULL;
1941 if (m->stray.length() > 0) {
1942 straydn = mdcache->add_replica_stray(m->stray, from);
1943 assert(straydn);
1944 m->stray.clear();
1945 }
1946
1947 // am i a new slave?
1948 MDRequestRef mdr;
1949 if (mdcache->have_request(m->get_reqid())) {
1950 // existing?
1951 mdr = mdcache->request_get(m->get_reqid());
1952
1953 // is my request newer?
1954 if (mdr->attempt > m->get_attempt()) {
1955 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
1956 << ", dropping " << *m << dendl;
1957 m->put();
1958 return;
1959 }
1960
1961
1962 if (mdr->attempt < m->get_attempt()) {
1963 // mine is old, close it out
1964 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
1965 << ", closing out" << dendl;
1966 mdcache->request_finish(mdr);
1967 mdr.reset();
1968 } else if (mdr->slave_to_mds != from) {
1969 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
1970 m->put();
1971 return;
1972 }
1973
1974 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
1975 mdr->aborted = true;
1976 if (mdr->slave_request) {
1977 // only abort on-going xlock, wrlock and auth pin
1978 assert(!mdr->slave_did_prepare());
1979 } else {
1980 mdcache->request_finish(mdr);
1981 }
1982 m->put();
1983 return;
1984 }
1985 }
1986 if (!mdr.get()) {
1987 // new?
1988 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
1989 dout(10) << "missing slave request for " << m->get_reqid()
1990 << " OP_FINISH, must have lost race with a forward" << dendl;
1991 m->put();
1992 return;
1993 }
1994 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
1995 mdr->set_op_stamp(m->op_stamp);
1996 }
1997 assert(mdr->slave_request == 0); // only one at a time, please!
1998
1999 if (straydn) {
2000 mdr->pin(straydn);
2001 mdr->straydn = straydn;
2002 }
2003
2004 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2005 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2006 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2007 return;
2008 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2009 mdr->locks.empty()) {
2010 dout(3) << "not active yet, waiting" << dendl;
2011 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2012 return;
2013 }
2014
2015 mdr->slave_request = m;
2016
2017 dispatch_slave_request(mdr);
2018 }
2019
2020 /* This function DOES put the passed message before returning*/
2021 void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
2022 {
2023 mds_rank_t from = mds_rank_t(m->get_source().num());
2024
2025 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2026 metareqid_t r = m->get_reqid();
2027 if (!mdcache->have_uncommitted_master(r, from)) {
2028 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2029 << from << " reqid " << r << dendl;
2030 m->put();
2031 return;
2032 }
2033 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2034 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2035 return;
2036 }
2037
2038 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2039 metareqid_t r = m->get_reqid();
2040 mdcache->committed_master_slave(r, from);
2041 m->put();
2042 return;
2043 }
2044
2045 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2046 if (m->get_attempt() != mdr->attempt) {
2047 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2048 << m->get_attempt() << dendl;
2049 m->put();
2050 return;
2051 }
2052
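// Each ack below records the sender in mdr->more()->slaves and clears it
// from waiting_on_slave; the request is re-dispatched once no more acks
// are outstanding.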
2053 switch (m->get_op()) {
2054 case MMDSSlaveRequest::OP_XLOCKACK:
2055 {
2056 // identify lock, master request
2057 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2058 m->get_object_info());
2059 mdr->more()->slaves.insert(from);
2060 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2061 mdr->xlocks.insert(lock);
2062 mdr->locks.insert(lock);
2063 mdr->finish_locking(lock);
2064 lock->get_xlock(mdr, mdr->get_client());
2065
2066 assert(mdr->more()->waiting_on_slave.count(from));
2067 mdr->more()->waiting_on_slave.erase(from);
2068 assert(mdr->more()->waiting_on_slave.empty());
2069 mdcache->dispatch_request(mdr);
2070 }
2071 break;
2072
2073 case MMDSSlaveRequest::OP_WRLOCKACK:
2074 {
2075 // identify lock, master request
2076 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2077 m->get_object_info());
2078 mdr->more()->slaves.insert(from);
2079 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2080 mdr->remote_wrlocks[lock] = from;
2081 mdr->locks.insert(lock);
2082 mdr->finish_locking(lock);
2083
2084 assert(mdr->more()->waiting_on_slave.count(from));
2085 mdr->more()->waiting_on_slave.erase(from);
2086 assert(mdr->more()->waiting_on_slave.empty());
2087 mdcache->dispatch_request(mdr);
2088 }
2089 break;
2090
2091 case MMDSSlaveRequest::OP_AUTHPINACK:
2092 handle_slave_auth_pin_ack(mdr, m);
2093 break;
2094
2095 case MMDSSlaveRequest::OP_LINKPREPACK:
2096 handle_slave_link_prep_ack(mdr, m);
2097 break;
2098
2099 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2100 handle_slave_rmdir_prep_ack(mdr, m);
2101 break;
2102
2103 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2104 handle_slave_rename_prep_ack(mdr, m);
2105 break;
2106
2107 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2108 handle_slave_rename_notify_ack(mdr, m);
2109 break;
2110
2111 default:
2112 ceph_abort();
2113 }
2114
2115 // done with reply.
2116 m->put();
2117 }
2118
2119 /* This function DOES put the mdr->slave_request before returning*/
2120 void Server::dispatch_slave_request(MDRequestRef& mdr)
2121 {
2122 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2123
2124 if (mdr->aborted) {
2125 dout(7) << " abort flag set, finishing" << dendl;
2126 mdcache->request_finish(mdr);
2127 return;
2128 }
2129
2130 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2131
2132 int op = mdr->slave_request->get_op();
2133 switch (op) {
2134 case MMDSSlaveRequest::OP_XLOCK:
2135 case MMDSSlaveRequest::OP_WRLOCK:
2136 {
2137 // identify object
2138 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2139 mdr->slave_request->get_object_info());
2140
2141 if (!lock) {
2142 dout(10) << "don't have object, dropping" << dendl;
2143 ceph_abort(); // can this happen if we auth pinned properly?
2144 }
2145 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2146 dout(10) << "not auth for remote xlock attempt, dropping on "
2147 << *lock << " on " << *lock->get_parent() << dendl;
2148 } else {
2149 // use acquire_locks so that we get auth_pinning.
2150 set<SimpleLock*> rdlocks;
2151 set<SimpleLock*> wrlocks = mdr->wrlocks;
2152 set<SimpleLock*> xlocks = mdr->xlocks;
2153
2154 int replycode = 0;
2155 switch (op) {
2156 case MMDSSlaveRequest::OP_XLOCK:
2157 xlocks.insert(lock);
2158 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2159 break;
2160 case MMDSSlaveRequest::OP_WRLOCK:
2161 wrlocks.insert(lock);
2162 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2163 break;
2164 }
2165
2166 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
2167 return;
2168
2169 // ack
2170 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
2171 r->set_lock_type(lock->get_type());
2172 lock->get_parent()->set_object_info(r->get_object_info());
2173 mds->send_message(r, mdr->slave_request->get_connection());
2174 }
2175
2176 // done.
2177 mdr->slave_request->put();
2178 mdr->slave_request = 0;
2179 }
2180 break;
2181
2182 case MMDSSlaveRequest::OP_UNXLOCK:
2183 case MMDSSlaveRequest::OP_UNWRLOCK:
2184 {
2185 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2186 mdr->slave_request->get_object_info());
2187 assert(lock);
2188 bool need_issue = false;
2189 switch (op) {
2190 case MMDSSlaveRequest::OP_UNXLOCK:
2191 mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
2192 break;
2193 case MMDSSlaveRequest::OP_UNWRLOCK:
2194 mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
2195 break;
2196 }
2197 if (need_issue)
2198 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2199
2200 // done. no ack necessary.
2201 mdr->slave_request->put();
2202 mdr->slave_request = 0;
2203 }
2204 break;
2205
2206 case MMDSSlaveRequest::OP_DROPLOCKS:
2207 mds->locker->drop_locks(mdr.get());
2208 mdr->slave_request->put();
2209 mdr->slave_request = 0;
2210 break;
2211
2212 case MMDSSlaveRequest::OP_AUTHPIN:
2213 handle_slave_auth_pin(mdr);
2214 break;
2215
2216 case MMDSSlaveRequest::OP_LINKPREP:
2217 case MMDSSlaveRequest::OP_UNLINKPREP:
2218 handle_slave_link_prep(mdr);
2219 break;
2220
2221 case MMDSSlaveRequest::OP_RMDIRPREP:
2222 handle_slave_rmdir_prep(mdr);
2223 break;
2224
2225 case MMDSSlaveRequest::OP_RENAMEPREP:
2226 handle_slave_rename_prep(mdr);
2227 break;
2228
2229 case MMDSSlaveRequest::OP_FINISH:
2230 // information about rename imported caps
2231 if (mdr->slave_request->inode_export.length() > 0)
2232 mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
2233 // finish off request.
2234 mdcache->request_finish(mdr);
2235 break;
2236
2237 default:
2238 ceph_abort();
2239 }
2240 }
2241
2242 /* This function DOES put the mdr->slave_request before returning*/
2243 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2244 {
2245 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2246
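// Three phases: resolve the requested objects from the cache, check that
// we are auth for (and can pin) each of them, then take the pins and ack
// with the list of objects actually pinned. Failures are reported via the
// wouldblock/rofs error flags on the ack rather than by dropping it.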
2247 // build list of objects
2248 list<MDSCacheObject*> objects;
2249 CInode *auth_pin_freeze = NULL;
2250 bool fail = false, wouldblock = false, readonly = false;
2251
2252 if (mdcache->is_readonly()) {
2253 dout(10) << " read-only FS" << dendl;
2254 readonly = true;
2255 fail = true;
2256 }
2257
2258 if (!fail) {
2259 for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
2260 p != mdr->slave_request->get_authpins().end();
2261 ++p) {
2262 MDSCacheObject *object = mdcache->get_object(*p);
2263 if (!object) {
2264 dout(10) << " don't have " << *p << dendl;
2265 fail = true;
2266 break;
2267 }
2268
2269 objects.push_back(object);
2270 if (*p == mdr->slave_request->get_authpin_freeze())
2271 auth_pin_freeze = static_cast<CInode*>(object);
2272 }
2273 }
2274
2275 // can we auth pin them?
2276 if (!fail) {
2277 for (list<MDSCacheObject*>::iterator p = objects.begin();
2278 p != objects.end();
2279 ++p) {
2280 if (!(*p)->is_auth()) {
2281 dout(10) << " not auth for " << **p << dendl;
2282 fail = true;
2283 break;
2284 }
2285 if (mdr->is_auth_pinned(*p))
2286 continue;
2287 if (!mdr->can_auth_pin(*p)) {
2288 if (mdr->slave_request->is_nonblock()) {
2289 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2290 fail = true;
2291 wouldblock = true;
2292 break;
2293 }
2294 // wait
2295 dout(10) << " waiting for authpinnable on " << **p << dendl;
2296 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2297 mdr->drop_local_auth_pins();
2298
2299 mds->locker->notify_freeze_waiter(*p);
2300 return;
2301 }
2302 }
2303 }
2304
2305 // auth pin!
2306 if (fail) {
2307 mdr->drop_local_auth_pins(); // just in case
2308 } else {
2309 /* if we already froze an auth pin on the wrong inode, unfreeze it */
2310 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2311 mdr->more()->rename_inode != auth_pin_freeze)
2312 mdr->unfreeze_auth_pin(true);
2313
2314 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2315 * on the source inode to complete. This happens after all locks for the rename
2316 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2317 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2318 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2319 * is called. The solution is to freeze the inode and prevent other MDRequests
2320 * from getting new auth pins.
2321 */
2322 if (auth_pin_freeze) {
2323 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2324 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2325 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2326 mds->mdlog->flush();
2327 return;
2328 }
2329 }
2330 for (list<MDSCacheObject*>::iterator p = objects.begin();
2331 p != objects.end();
2332 ++p) {
2333 dout(10) << "auth_pinning " << **p << dendl;
2334 mdr->auth_pin(*p);
2335 }
2336 }
2337
2338 // ack!
2339 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2340
2341 // return list of my auth_pins (if any)
2342 for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
2343 p != mdr->auth_pins.end();
2344 ++p) {
2345 MDSCacheObjectInfo info;
2346 (*p)->set_object_info(info);
2347 reply->get_authpins().push_back(info);
2348 if (*p == (MDSCacheObject*)auth_pin_freeze)
2349 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2350 }
2351
2352 if (wouldblock)
2353 reply->mark_error_wouldblock();
2354 if (readonly)
2355 reply->mark_error_rofs();
2356
2357 mds->send_message_mds(reply, mdr->slave_to_mds);
2358
2359 // clean up this request
2360 mdr->slave_request->put();
2361 mdr->slave_request = 0;
2362 return;
2363 }
2364
2365 /* This function DOES NOT put the passed ack before returning*/
2366 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
2367 {
2368 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2369 mds_rank_t from = mds_rank_t(ack->get_source().num());
2370
2371 // added auth pins?
2372 set<MDSCacheObject*> pinned;
2373 for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
2374 p != ack->get_authpins().end();
2375 ++p) {
2376 MDSCacheObject *object = mdcache->get_object(*p);
2377 assert(object); // we pinned it
2378 dout(10) << " remote has pinned " << *object << dendl;
2379 if (!mdr->is_auth_pinned(object))
2380 mdr->remote_auth_pins[object] = from;
2381 if (*p == ack->get_authpin_freeze())
2382 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2383 pinned.insert(object);
2384 }
2385
2386 // removed frozen auth pin ?
2387 if (mdr->more()->is_remote_frozen_authpin &&
2388 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2389 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2390 assert(p != mdr->remote_auth_pins.end());
2391 if (p->second == from) {
2392 mdr->more()->is_remote_frozen_authpin = false;
2393 }
2394 }
2395
2396 // removed auth pins?
2397 map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
2398 while (p != mdr->remote_auth_pins.end()) {
2399 MDSCacheObject* object = p->first;
2400 if (p->second == from && pinned.count(object) == 0) {
2401 dout(10) << " remote has unpinned " << *object << dendl;
2402 mdr->remote_auth_pins.erase(p++);
2403 } else {
2404 ++p;
2405 }
2406 }
2407
2408 if (ack->is_error_rofs()) {
2409 mdr->more()->slave_error = -EROFS;
2410 mdr->aborted = true;
2411 } else if (ack->is_error_wouldblock()) {
2412 mdr->more()->slave_error = -EWOULDBLOCK;
2413 mdr->aborted = true;
2414 }
2415
2416 // note slave
2417 mdr->more()->slaves.insert(from);
2418
2419 // clear from waiting list
2420 assert(mdr->more()->waiting_on_slave.count(from));
2421 mdr->more()->waiting_on_slave.erase(from);
2422
2423 // go again?
2424 if (mdr->more()->waiting_on_slave.empty())
2425 mdcache->dispatch_request(mdr);
2426 else
2427 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2428 }
2429
2430
2431 // ---------------------------------------
2432 // HELPERS
2433
2434
2435 /**
2436 * check whether we are permitted to complete a request
2437 *
2438 * Check whether we have permission to perform the operation specified
2439 * by mask on the given inode, based on the capability in the mdr's
2440 * session.
2441 */
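// Typical call-site pattern in this file (the helper itself replies with
// the error and returns false on failure):
//
//   if (!check_access(mdr, cur, MAY_WRITE))
//     return;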
2442 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2443 {
2444 if (mdr->session) {
2445 int r = mdr->session->check_access(
2446 in, mask,
2447 mdr->client_request->get_caller_uid(),
2448 mdr->client_request->get_caller_gid(),
2449 &mdr->client_request->get_caller_gid_list(),
2450 mdr->client_request->head.args.setattr.uid,
2451 mdr->client_request->head.args.setattr.gid);
2452 if (r < 0) {
2453 respond_to_request(mdr, r);
2454 return false;
2455 }
2456 }
2457 return true;
2458 }
2459
2460 /**
2461 * check whether fragment has reached maximum size
2462 *
2463 */
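// The limit is the mds_bal_fragment_size_max config option; callers bail
// out before linking another dentry into an oversized fragment, e.g.:
//
//   if (!check_fragment_space(mdr, dir))
//     return;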
2464 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2465 {
2466 const auto size = in->get_frag_size();
2467 if (size >= g_conf->mds_bal_fragment_size_max) {
2468 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2469 respond_to_request(mdr, -ENOSPC);
2470 return false;
2471 }
2472
2473 return true;
2474 }
2475
2476
2477 /** validate_dentry_dir
2478 *
2479 * verify that the dir exists and would own the dname.
2480 * do not check if the dentry exists.
2481 */
2482 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, boost::string_view dname)
2483 {
2484 // make sure parent is a dir?
2485 if (!diri->is_dir()) {
2486 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2487 respond_to_request(mdr, -ENOTDIR);
2488 return NULL;
2489 }
2490
2491 // which dirfrag?
2492 frag_t fg = diri->pick_dirfrag(dname);
2493 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2494 if (!dir)
2495 return 0;
2496
2497 // frozen?
2498 if (dir->is_frozen()) {
2499 dout(7) << "dir is frozen " << *dir << dendl;
2500 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2501 return NULL;
2502 }
2503
2504 return dir;
2505 }
2506
2507
2508 /** prepare_null_dentry
2509 * prepare a null (or existing) dentry in given dir.
2510 * wait for any dn lock.
2511 */
2512 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, boost::string_view dname, bool okexist)
2513 {
2514 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2515 assert(dir->is_auth());
2516
2517 client_t client = mdr->get_client();
2518
2519 // does it already exist?
2520 CDentry *dn = dir->lookup(dname);
2521 if (dn) {
2522 /*
2523 if (dn->lock.is_xlocked_by_other(mdr)) {
2524 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2525 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2526 return 0;
2527 }
2528 */
2529 if (!dn->get_linkage(client, mdr)->is_null()) {
2530 // name already exists
2531 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
2532 if (!okexist) {
2533 respond_to_request(mdr, -EEXIST);
2534 return 0;
2535 }
2536 } else {
2537 dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
2538 }
2539
2540 return dn;
2541 }
2542
2543 // make sure dir is complete
2544 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2545 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2546 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2547 return 0;
2548 }
2549
2550 // create
2551 dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
2552 dn->mark_new();
2553 dout(10) << "prepare_null_dentry added " << *dn << dendl;
2554 return dn;
2555 }
2556
2557 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2558 {
2559 CDentry *straydn = mdr->straydn;
2560 if (straydn) {
2561 string straydname;
2562 in->name_stray_dentry(straydname);
2563 if (straydn->get_name() == straydname)
2564 return straydn;
2565
2566 assert(!mdr->done_locking);
2567 mdr->unpin(straydn);
2568 }
2569
2570 CDir *straydir = mdcache->get_stray_dir(in);
2571
2572 if (!mdr->client_request->is_replay() &&
2573 !check_fragment_space(mdr, straydir))
2574 return NULL;
2575
2576 straydn = mdcache->get_or_create_stray_dentry(in);
2577 mdr->straydn = straydn;
2578 mdr->pin(straydn);
2579 return straydn;
2580 }
2581
2582 /** prepare_new_inode
2583 *
2584 * create a new inode. set c/m/atime. hit dir pop.
2585 */
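// Inode numbers come from the session's preallocated pool when possible
// (prealloc -> used); otherwise a fresh id is projected from the InoTable.
// Either way the allocation is journaled via journal_allocated_inos() and
// made durable in apply_allocated_inos().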
2586 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
2587 file_layout_t *layout)
2588 {
2589 CInode *in = new CInode(mdcache);
2590
2591 // Server::prepare_force_open_sessions() can re-open a session that is in
2592 // the closing state. In that corner case, the session's prealloc_inos are
2593 // being freed. To simplify the code, we disallow using/refilling the
2594 // session's prealloc_inos while the session is opening.
2595 bool allow_prealloc_inos = !mdr->session->is_opening();
2596
2597 // assign ino
2598 if (allow_prealloc_inos &&
2599 mdr->session->info.prealloc_inos.size()) {
2600 mdr->used_prealloc_ino =
2601 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
2602 mds->sessionmap.mark_projected(mdr->session);
2603
2604 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
2605 << " (" << mdr->session->info.prealloc_inos
2606 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
2607 << dendl;
2608 } else {
2609 mdr->alloc_ino =
2610 in->inode.ino = mds->inotable->project_alloc_id();
2611 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
2612 }
2613
2614 if (useino && useino != in->inode.ino) {
2615 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
2616 mds->clog->error() << mdr->client_request->get_source()
2617 << " specified ino " << useino
2618 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
2619 //ceph_abort(); // just for now.
2620 }
2621
2622 if (allow_prealloc_inos &&
2623 mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
2624 int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
2625 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
2626 assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
2627 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
2628 mds->sessionmap.mark_projected(mdr->session);
2629 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
2630 }
2631
2632 in->inode.version = 1;
2633 in->inode.xattr_version = 1;
2634 in->inode.nlink = 1; // FIXME
2635
2636 in->inode.mode = mode;
2637
2638 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
2639 if (in->inode.is_dir()) {
2640 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
2641 } else if (layout) {
2642 in->inode.layout = *layout;
2643 } else {
2644 in->inode.layout = mdcache->default_file_layout;
2645 }
2646
2647 in->inode.truncate_size = -1ull; // not truncated, yet!
2648 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
2649
2650 CInode *diri = dir->get_inode();
2651
2652 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
2653
2654 if (diri->inode.mode & S_ISGID) {
2655 dout(10) << " dir is setgid" << dendl;
2656 in->inode.gid = diri->inode.gid;
2657 if (S_ISDIR(mode)) {
2658 dout(10) << " new dir is also setgid" << dendl;
2659 in->inode.mode |= S_ISGID;
2660 }
2661 } else
2662 in->inode.gid = mdr->client_request->get_caller_gid();
2663
2664 in->inode.uid = mdr->client_request->get_caller_uid();
2665
2666 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
2667 mdr->get_op_stamp();
2668
2669 in->inode.change_attr = 0;
2670
2671 MClientRequest *req = mdr->client_request;
2672 if (req->get_data().length()) {
2673 bufferlist::iterator p = req->get_data().begin();
2674
2675 // xattrs on new inode?
2676 CInode::mempool_xattr_map xattrs;
2677 ::decode(xattrs, p);
2678 for (const auto &p : xattrs) {
2679 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
2680 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
2681 if (!em.second)
2682 em.first->second = p.second;
2683 }
2684 }
2685
2686 if (!mds->mdsmap->get_inline_data_enabled() ||
2687 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
2688 in->inode.inline_data.version = CEPH_INLINE_NONE;
2689
2690 mdcache->add_inode(in); // add
2691 dout(10) << "prepare_new_inode " << *in << dendl;
2692 return in;
2693 }
2694
2695 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
2696 {
2697 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
2698 << " inotablev " << mds->inotable->get_projected_version()
2699 << dendl;
2700 blob->set_ino_alloc(mdr->alloc_ino,
2701 mdr->used_prealloc_ino,
2702 mdr->prealloc_inos,
2703 mdr->client_request->get_source(),
2704 mds->sessionmap.get_projected(),
2705 mds->inotable->get_projected_version());
2706 }
2707
2708 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2709 {
2710 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2711 << " / " << mdr->prealloc_inos
2712 << " / " << mdr->used_prealloc_ino << dendl;
2713
2714 if (mdr->alloc_ino) {
2715 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2716 }
2717 if (mdr->prealloc_inos.size()) {
2718 assert(session);
2719 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2720 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2721 mds->sessionmap.mark_dirty(session);
2722 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2723 }
2724 if (mdr->used_prealloc_ino) {
2725 assert(session);
2726 session->info.used_inos.erase(mdr->used_prealloc_ino);
2727 mds->sessionmap.mark_dirty(session);
2728 }
2729 }
2730
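// ESTALE recovery helper: when path traversal fails because the base
// inode is unknown here, ask the other ranks via find_ino_peers() and
// retry the request; only if that also fails do we reply ESTALE.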
2731 class C_MDS_TryFindInode : public ServerContext {
2732 MDRequestRef mdr;
2733 public:
2734 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2735 void finish(int r) override {
2736 if (r == -ESTALE) // :( find_ino_peers failed
2737 server->respond_to_request(mdr, r);
2738 else
2739 server->dispatch_client_request(mdr);
2740 }
2741 };
2742
2743 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2744 {
2745 // figure parent dir vs dname
2746 if (refpath.depth() == 0) {
2747 dout(7) << "can't do that to root" << dendl;
2748 respond_to_request(mdr, -EINVAL);
2749 return 0;
2750 }
2751 string dname = refpath.last_dentry();
2752 refpath.pop_dentry();
2753
2754 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2755
2756 // traverse to parent dir
2757 CInode *diri;
2758 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2759 if (r > 0) return 0; // delayed
2760 if (r < 0) {
2761 if (r == -ESTALE) {
2762 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2763 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2764 return 0;
2765 }
2766 respond_to_request(mdr, r);
2767 return 0;
2768 }
2769
2770 // is it an auth dir?
2771 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2772 if (!dir)
2773 return 0; // forwarded or waiting for freeze
2774
2775 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2776 return dir;
2777 }
2778
2779 /* If this returns null, the request has been handled
2780 * as appropriate: forwarded on, or the client's been replied to */
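// n selects which of the request's two filepaths to resolve: 0 means
// get_filepath(), nonzero means get_filepath2() (used by two-path ops).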
2781 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
2782 set<SimpleLock*> &rdlocks,
2783 bool want_auth,
2784 bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
2785 a snapped dir */
2786 file_layout_t **layout,
2787 bool no_lookup) // true if we cannot return a null dentry lease
2788 {
2789 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2790 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
2791
2792 if (mdr->done_locking)
2793 return mdr->in[n];
2794
2795 // traverse
2796 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
2797 if (r > 0)
2798 return NULL; // delayed
2799 if (r < 0) { // error
2800 if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
2801 if (!no_lookup)
2802 mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
2803 respond_to_request(mdr, r);
2804 } else if (r == -ESTALE) {
2805 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2806 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
2807 mdcache->find_ino_peers(refpath.get_ino(), c);
2808 } else {
2809 dout(10) << "FAIL on error " << r << dendl;
2810 respond_to_request(mdr, r);
2811 }
2812 return 0;
2813 }
2814 CInode *ref = mdr->in[n];
2815 dout(10) << "ref is " << *ref << dendl;
2816
2817 // fw to inode auth?
2818 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
2819 want_auth = true;
2820
2821 if (want_auth) {
2822 if (ref->is_ambiguous_auth()) {
2823 dout(10) << "waiting for single auth on " << *ref << dendl;
2824 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
2825 return 0;
2826 }
2827 if (!ref->is_auth()) {
2828 dout(10) << "fw to auth for " << *ref << dendl;
2829 mdcache->request_forward(mdr, ref->authority().first);
2830 return 0;
2831 }
2832
2833 // auth_pin?
2834 // do NOT proceed if freezing, as cap release may defer in that case, and
2835 // we could deadlock when we try to lock @ref.
2836 // if we're already auth_pinned, continue; the release has already been processed.
2837 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
2838 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
2839 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
2840 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2841 /* If we have any auth pins, this will deadlock.
2842 * But the only way to get here if we've already got auth pins
2843 * is because we're on an inode with snapshots that got updated
2844 * between dispatches of this request. So we're going to drop
2845 * our locks and our auth pins and reacquire them later.
2846 *
2847 * This is safe since we're only in this function when working on
2848 * a single MDS request; otherwise we'd be in
2849 * rdlock_path_xlock_dentry.
2850 */
2851 mds->locker->drop_locks(mdr.get(), NULL);
2852 mdr->drop_local_auth_pins();
2853 if (!mdr->remote_auth_pins.empty())
2854 mds->locker->notify_freeze_waiter(ref);
2855 return 0;
2856 }
2857
2858 mdr->auth_pin(ref);
2859 }
2860
2861 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2862 rdlocks.insert(&mdr->dn[n][i]->lock);
2863 if (layout)
2864 mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
2865 else
2866 mds->locker->include_snap_rdlocks(rdlocks, ref);
2867
2868 // set and pin ref
2869 mdr->pin(ref);
2870 return ref;
2871 }
2872
2873
2874 /** rdlock_path_xlock_dentry
2875 * traverse path to the directory that could/would contain dentry.
2876 * make sure i am auth for that dentry, forward as necessary.
2877 * create null dentry in place (or use existing if okexist).
2878 * get rdlocks on traversed dentries, xlock on new dentry.
2879 */
2880 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
2881 set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
2882 bool okexist, bool mustexist, bool alwaysxlock,
2883 file_layout_t **layout)
2884 {
2885 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2886
2887 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
2888
2889 client_t client = mdr->get_client();
2890
2891 if (mdr->done_locking)
2892 return mdr->dn[n].back();
2893
2894 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
2895 if (!dir) return 0;
2896
2897 CInode *diri = dir->get_inode();
2898 if (!mdr->reqid.name.is_mds()) {
2899 if (diri->is_system() && !diri->is_root()) {
2900 respond_to_request(mdr, -EROFS);
2901 return 0;
2902 }
2903 }
2904 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
2905 respond_to_request(mdr, -ENOENT);
2906 return 0;
2907 }
2908
2909 // make a null dentry?
2910 boost::string_view dname = refpath.last_dentry();
2911 CDentry *dn;
2912 if (mustexist) {
2913 dn = dir->lookup(dname);
2914
2915 // make sure dir is complete
2916 if (!dn && !dir->is_complete() &&
2917 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2918 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2919 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2920 return 0;
2921 }
2922
2923 // readable?
2924 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
2925 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2926 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2927 return 0;
2928 }
2929
2930 // exists?
2931 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
2932 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
2933 respond_to_request(mdr, -ENOENT);
2934 return 0;
2935 }
2936 } else {
2937 dn = prepare_null_dentry(mdr, dir, dname, okexist);
2938 if (!dn)
2939 return 0;
2940 }
2941
2942 mdr->dn[n].push_back(dn);
2943 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
2944 mdr->in[n] = dnl->get_inode();
2945
2946 // -- lock --
2947 // NOTE: rename takes the same set of locks for srcdn
2948 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2949 rdlocks.insert(&mdr->dn[n][i]->lock);
2950 if (alwaysxlock || dnl->is_null())
2951 xlocks.insert(&dn->lock); // new dn, xlock
2952 else
2953 rdlocks.insert(&dn->lock); // existing dn, rdlock
2954 wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
2955 wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir rstat (nestlock)
2956 if (layout)
2957 mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
2958 else
2959 mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);
2960
2961 return dn;
2962 }
2963
2964
2965
2966
2967
2968 /**
2969 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2970 *
2971 * @param diri base inode
2972 * @param fg the exact frag we want
2973 * @param mdr request
2974 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2975 */
2976 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2977 {
2978 CDir *dir = diri->get_dirfrag(fg);
2979
2980 // not open and inode not mine?
2981 if (!dir && !diri->is_auth()) {
2982 mds_rank_t inauth = diri->authority().first;
2983 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2984 mdcache->request_forward(mdr, inauth);
2985 return 0;
2986 }
2987
2988 // not open and inode frozen?
2989 if (!dir && diri->is_frozen()) {
2990 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2991 assert(diri->get_parent_dir());
2992 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2993 return 0;
2994 }
2995
2996 // invent?
2997 if (!dir)
2998 dir = diri->get_or_open_dirfrag(mdcache, fg);
2999
3000 // am i auth for the dirfrag?
3001 if (!dir->is_auth()) {
3002 mds_rank_t auth = dir->authority().first;
3003 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3004 << ", fw to mds." << auth << dendl;
3005 mdcache->request_forward(mdr, auth);
3006 return 0;
3007 }
3008
3009 return dir;
3010 }
3011
3012
3013 // ===============================================================================
3014 // STAT
3015
3016 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3017 {
3018 MClientRequest *req = mdr->client_request;
3019 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3020
3021 if (req->get_filepath().depth() == 0 && is_lookup) {
3022 // refpath can't be empty for lookup but it can for
3023 // getattr (we do getattr with empty refpath for mount of '/')
3024 respond_to_request(mdr, -EINVAL);
3025 return;
3026 }
3027
3028 bool want_auth = false;
3029 int mask = req->head.args.getattr.mask;
3030 if (mask & CEPH_STAT_RSTAT)
3031 want_auth = true; // rstats are only maintained authoritatively on the auth MDS
3032
3033 CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, want_auth, false, NULL,
3034 !is_lookup);
3035 if (!ref) return;
3036
3037 /*
3038 * if client currently holds the EXCL cap on a field, do not rdlock
3039 * it; client's stat() will result in valid info if _either_ EXCL
3040 * cap is held or MDS rdlocks and reads the value here.
3041 *
3042 * handling this case here is easier than weakening rdlock
3043 * semantics... that would cause problems elsewhere.
3044 */
3045 client_t client = mdr->get_client();
3046 int issued = 0;
3047 Capability *cap = ref->get_client_cap(client);
3048 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3049 mdr->snapid <= cap->client_follows))
3050 issued = cap->issued();
3051
3052 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3053 rdlocks.insert(&ref->linklock);
3054 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3055 rdlocks.insert(&ref->authlock);
3056 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3057 rdlocks.insert(&ref->xattrlock);
3058 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3059 // Don't wait on unstable filelock if client is allowed to read file size.
3060 // This can reduce the response time of getattr in the case that multiple
3061 // clients do stat(2) and there are writers.
3062 // The downside of this optimization is that mds may not issue Fs caps along
3063 // with getattr reply. Client may need to send more getattr requests.
3064 if (mdr->rdlocks.count(&ref->filelock)) {
3065 rdlocks.insert(&ref->filelock);
3066 } else if (ref->filelock.is_stable() ||
3067 ref->filelock.get_num_wrlocks() > 0 ||
3068 !ref->filelock.can_read(mdr->get_client())) {
3069 rdlocks.insert(&ref->filelock);
3070 mdr->done_locking = false;
3071 }
3072 }
3073
3074 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3075 return;
3076
3077 if (!check_access(mdr, ref, MAY_READ))
3078 return;
3079
3080 utime_t now = ceph_clock_now();
3081 mdr->set_mds_stamp(now);
3082
3083 // note which caps are requested, so we return at least a snapshot
3084 // value for them. (currently this matters for xattrs and inline data)
3085 mdr->getattr_caps = mask;
3086
3087 mds->balancer->hit_inode(now, ref, META_POP_IRD,
3088 req->get_source().num());
3089
3090 // reply
3091 dout(10) << "reply to stat on " << *req << dendl;
3092 mdr->tracei = ref;
3093 if (is_lookup)
3094 mdr->tracedn = mdr->dn[0].back();
3095 respond_to_request(mdr, 0);
3096 }
3097
3098 struct C_MDS_LookupIno2 : public ServerContext {
3099 MDRequestRef mdr;
3100 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3101 void finish(int r) override {
3102 server->_lookup_ino_2(mdr, r);
3103 }
3104 };
3105
3106 /* This function DOES clean up the mdr before returning*/
3107 /*
3108 * filepath: ino
3109 */
3110 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3111 bool want_parent, bool want_dentry)
3112 {
3113 MClientRequest *req = mdr->client_request;
3114
3115 inodeno_t ino = req->get_filepath().get_ino();
3116 CInode *in = mdcache->get_inode(ino);
3117 if (in && in->state_test(CInode::STATE_PURGING)) {
3118 respond_to_request(mdr, -ESTALE);
3119 return;
3120 }
3121 if (!in) {
3122 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3123 return;
3124 }
3125
3126 if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
3127 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3128 return;
3129 }
3130
3131 // check for nothing (not read or write); this still applies the
3132 // path check.
3133 if (!check_access(mdr, in, 0))
3134 return;
3135
3136 CDentry *dn = in->get_projected_parent_dn();
3137 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3138
3139 set<SimpleLock*> rdlocks;
3140 if (dn && (want_parent || want_dentry)) {
3141 mdr->pin(dn);
3142 rdlocks.insert(&dn->lock);
3143 }
3144
3145 unsigned mask = req->head.args.getattr.mask;
3146 if (mask) {
3147 Capability *cap = in->get_client_cap(mdr->get_client());
3148 int issued = 0;
3149 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3150 issued = cap->issued();
3151 // permission bits, ACL/security xattrs
3152 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3153 rdlocks.insert(&in->authlock);
3154 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3155 rdlocks.insert(&in->xattrlock);
3156
3157 mdr->getattr_caps = mask;
3158 }
3159
3160 if (!rdlocks.empty()) {
3161 set<SimpleLock*> wrlocks, xlocks;
3162 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3163 return;
3164
3165 if (diri != NULL) {
3166 // need read access to directory inode
3167 if (!check_access(mdr, diri, MAY_READ))
3168 return;
3169 }
3170 }
3171
3172 if (want_parent) {
3173 if (in->is_base()) {
3174 respond_to_request(mdr, -EINVAL);
3175 return;
3176 }
3177 if (!diri || diri->is_stray()) {
3178 respond_to_request(mdr, -ESTALE);
3179 return;
3180 }
3181 dout(10) << "reply to lookup_parent " << *in << dendl;
3182 mdr->tracei = diri;
3183 respond_to_request(mdr, 0);
3184 } else {
3185 if (want_dentry) {
3186 inodeno_t dirino = req->get_filepath2().get_ino();
3187 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3188 respond_to_request(mdr, -ENOENT);
3189 return;
3190 }
3191 dout(10) << "reply to lookup_name " << *in << dendl;
3192 } else
3193 dout(10) << "reply to lookup_ino " << *in << dendl;
3194
3195 mdr->tracei = in;
3196 if (want_dentry)
3197 mdr->tracedn = dn;
3198 respond_to_request(mdr, 0);
3199 }
3200 }
3201
3202 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3203 {
3204 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3205 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3206
3207 // `r` is a rank if >=0, else an error code
3208 if (r >= 0) {
3209 mds_rank_t dest_rank(r);
3210 if (dest_rank == mds->get_nodeid())
3211 dispatch_client_request(mdr);
3212 else
3213 mdcache->request_forward(mdr, dest_rank);
3214 return;
3215 }
3216
3217 // give up
3218 if (r == -ENOENT || r == -ENODATA)
3219 r = -ESTALE;
3220 respond_to_request(mdr, r);
3221 }
3222
3223
3224 /* This function takes responsibility for the passed mdr*/
3225 void Server::handle_client_open(MDRequestRef& mdr)
3226 {
3227 MClientRequest *req = mdr->client_request;
3228 dout(7) << "open on " << req->get_filepath() << dendl;
3229
3230 int flags = req->head.args.open.flags;
3231 int cmode = ceph_flags_to_mode(flags);
3232 if (cmode < 0) {
3233 respond_to_request(mdr, -EINVAL);
3234 return;
3235 }
3236
3237 bool need_auth = !file_mode_is_readonly(cmode) ||
3238 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3239
3240 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3241 dout(7) << "read-only FS" << dendl;
3242 respond_to_request(mdr, -EROFS);
3243 return;
3244 }
3245
3246 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3247 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3248 if (!cur)
3249 return;
3250
3251 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3252 assert(!need_auth);
3253 mdr->done_locking = false;
3254 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); // re-traverse with want_auth; don't shadow cur
3255 if (!cur)
3256 return;
3257 }
3258
3259 if (!cur->inode.is_file()) {
3260 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3261 cmode = CEPH_FILE_MODE_PIN;
3262 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
3263 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3264 flags &= ~CEPH_O_TRUNC;
3265 }
3266
3267 dout(10) << "open flags = " << flags
3268 << ", filemode = " << cmode
3269 << ", need_auth = " << need_auth
3270 << dendl;
3271
3272 // regular file?
3273 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3274 dout(7) << "not a file or dir " << *cur << dendl;
3275 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3276 return;
3277 }*/
3278 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3279 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3280 respond_to_request(mdr, -EINVAL);
3281 return;
3282 }
3283
3284 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3285 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3286 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3287 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3288 return;
3289 }
3290
3291 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3292 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3293 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3294 respond_to_request(mdr, -EPERM);
3295 return;
3296 }
3297
3298 // snapped data is read only
3299 if (mdr->snapid != CEPH_NOSNAP &&
3300 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3301 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3302 respond_to_request(mdr, -EROFS);
3303 return;
3304 }
3305
3306 unsigned mask = req->head.args.open.mask;
3307 if (mask) {
3308 Capability *cap = cur->get_client_cap(mdr->get_client());
3309 int issued = 0;
3310 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3311 issued = cap->issued();
3312 // permission bits, ACL/security xattrs
3313 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3314 rdlocks.insert(&cur->authlock);
3315 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3316 rdlocks.insert(&cur->xattrlock);
3317
3318 mdr->getattr_caps = mask;
3319 }
3320
3321 // O_TRUNC
3322 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3323 assert(cur->is_auth());
3324
3325 xlocks.insert(&cur->filelock);
3326 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3327 return;
3328
3329 if (!check_access(mdr, cur, MAY_WRITE))
3330 return;
3331
3332 // wait for pending truncate?
3333 const auto pi = cur->get_projected_inode();
3334 if (pi->is_truncating()) {
3335 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3336 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3337 mds->locker->drop_locks(mdr.get());
3338 mdr->drop_local_auth_pins();
3339 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3340 return;
3341 }
3342
3343 do_open_truncate(mdr, cmode);
3344 return;
3345 }
3346
3347 // sync filelock if snapped.
3348 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3349 // and that data itself is flushed so that we can read the snapped data off disk.
3350 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3351 rdlocks.insert(&cur->filelock);
3352 }
3353
3354 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3355 return;
3356
3357 mask = MAY_READ;
3358 if (cmode & CEPH_FILE_MODE_WR)
3359 mask |= MAY_WRITE;
3360 if (!check_access(mdr, cur, mask))
3361 return;
3362
3363 utime_t now = ceph_clock_now();
3364 mdr->set_mds_stamp(now);
3365
3366 if (cur->is_file() || cur->is_dir()) {
3367 if (mdr->snapid == CEPH_NOSNAP) {
3368 // register new cap
3369 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3370 if (cap)
3371 dout(12) << "open issued caps " << ccap_string(cap->pending())
3372 << " for " << req->get_source()
3373 << " on " << *cur << dendl;
3374 } else {
3375 int caps = ceph_caps_for_mode(cmode);
3376 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3377 << " for " << req->get_source()
3378 << " snapid " << mdr->snapid
3379 << " on " << *cur << dendl;
3380 mdr->snap_caps = caps;
3381 }
3382 }
3383
3384 // increase max_size?
3385 if (cmode & CEPH_FILE_MODE_WR)
3386 mds->locker->check_inode_max_size(cur);
3387
3388 // make sure this inode gets into the journal
3389 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3390 !cur->item_open_file.is_on_list()) {
3391 LogSegment *ls = mds->mdlog->get_current_segment();
3392 EOpen *le = new EOpen(mds->mdlog);
3393 mdlog->start_entry(le);
3394 le->add_clean_inode(cur);
3395 ls->open_files.push_back(&cur->item_open_file);
3396 mdlog->submit_entry(le);
3397 }
3398
3399 // hit pop
3400 if (cmode & CEPH_FILE_MODE_WR)
3401 mds->balancer->hit_inode(now, cur, META_POP_IWR);
3402 else
3403 mds->balancer->hit_inode(now, cur, META_POP_IRD,
3404 mdr->client_request->get_source().num());
3405
3406 CDentry *dn = 0;
3407 if (req->get_dentry_wanted()) {
3408 assert(mdr->dn[0].size());
3409 dn = mdr->dn[0].back();
3410 }
3411
3412 mdr->tracei = cur;
3413 mdr->tracedn = dn;
3414 respond_to_request(mdr, 0);
3415 }
3416
3417 class C_MDS_openc_finish : public ServerLogContext {
3418 CDentry *dn;
3419 CInode *newi;
3420 snapid_t follows;
3421 public:
3422 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
3423 ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
3424 void finish(int r) override {
3425 assert(r == 0);
3426
3427 dn->pop_projected_linkage();
3428
3429 // dirty inode, dn, dir
3430 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3431 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3432 newi->mark_dirty_parent(mdr->ls, true);
3433
3434 mdr->apply();
3435
3436 get_mds()->locker->share_inode_max_size(newi);
3437
3438 MDRequestRef null_ref;
3439 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3440
3441 utime_t now = ceph_clock_now();
3442 get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
3443
3444 server->respond_to_request(mdr, 0);
3445
3446 assert(g_conf->mds_kill_openc_at != 1);
3447 }
3448 };
3449
3450 /* This function takes responsibility for the passed mdr. */
3451 void Server::handle_client_openc(MDRequestRef& mdr)
3452 {
3453 MClientRequest *req = mdr->client_request;
3454 client_t client = mdr->get_client();
3455
3456 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
3457
3458 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
3459 if (cmode < 0) {
3460 respond_to_request(mdr, -EINVAL);
3461 return;
3462 }
3463
3464 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
3465
3466 if (!excl) {
3467 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
3468 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
3469 if (r > 0) return;
3470 if (r == 0) {
3471 // it existed.
3472 handle_client_open(mdr);
3473 return;
3474 }
3475 if (r < 0 && r != -ENOENT) {
3476 if (r == -ESTALE) {
3477 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3478 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
3479 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
3480 } else {
3481 dout(10) << "FAIL on error " << r << dendl;
3482 respond_to_request(mdr, r);
3483 }
3484 return;
3485 }
3486 }
3487
3488 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3489 file_layout_t *dir_layout = NULL;
3490 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
3491 !excl, false, false, &dir_layout);
3492 if (!dn) return;
3493 if (mdr->snapid != CEPH_NOSNAP) {
3494 respond_to_request(mdr, -EROFS);
3495 return;
3496 }
3497 // set layout
3498 file_layout_t layout;
3499 if (dir_layout)
3500 layout = *dir_layout;
3501 else
3502 layout = mdcache->default_file_layout;
3503
3504 // What kind of client caps are required to complete this operation
3505 uint64_t access = MAY_WRITE;
3506
3507 const auto default_layout = layout;
3508
3509 // fill in any special params from client
3510 if (req->head.args.open.stripe_unit)
3511 layout.stripe_unit = req->head.args.open.stripe_unit;
3512 if (req->head.args.open.stripe_count)
3513 layout.stripe_count = req->head.args.open.stripe_count;
3514 if (req->head.args.open.object_size)
3515 layout.object_size = req->head.args.open.object_size;
3516 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
3517 (__s32)req->head.args.open.pool >= 0) {
3518 layout.pool_id = req->head.args.open.pool;
3519
3520 // make sure we have as new a map as the client
3521 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
3522 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
3523 return;
3524 }
3525 }
3526
3527 // If the client doesn't have the capability to modify layouts, only
3528 // permit this request when the requested layout matches what the file
3529 // would have inherited from its parent anyway.
3530 if (default_layout != layout) {
3531 access |= MAY_SET_VXATTR;
3532 }
3533
3534 if (!layout.is_valid()) {
3535 dout(10) << " invalid initial file layout" << dendl;
3536 respond_to_request(mdr, -EINVAL);
3537 return;
3538 }
3539 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
3540 dout(10) << " invalid data pool " << layout.pool_id << dendl;
3541 respond_to_request(mdr, -EINVAL);
3542 return;
3543 }
3544
3545 // created null dn.
3546 CDir *dir = dn->get_dir();
3547 CInode *diri = dir->get_inode();
3548 rdlocks.insert(&diri->authlock);
3549 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3550 return;
3551
3552 if (!check_access(mdr, diri, access))
3553 return;
3554
3555 if (!check_fragment_space(mdr, dir))
3556 return;
3557
3558 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3559
3560 if (!dnl->is_null()) {
3561 // it existed.
3562 assert(req->head.args.open.flags & CEPH_O_EXCL);
3563 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
3564 mdr->tracei = dnl->get_inode();
3565 mdr->tracedn = dn;
3566 respond_to_request(mdr, -EEXIST);
3567 return;
3568 }
3569
3570 // create inode.
3571 SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3572 snapid_t follows = realm->get_newest_seq();
3573
3574 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
3575 req->head.args.open.mode | S_IFREG, &layout);
3576 assert(in);
3577
3578 // it's a file.
3579 dn->push_projected_linkage(in);
3580
3581 in->inode.version = dn->pre_dirty();
3582 if (layout.pool_id != mdcache->default_file_layout.pool_id)
3583 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
3584 in->inode.update_backtrace();
3585 if (cmode & CEPH_FILE_MODE_WR) {
3586 in->inode.client_ranges[client].range.first = 0;
3587 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
3588 in->inode.client_ranges[client].follows = follows;
3589 }
3590 in->inode.rstat.rfiles = 1;
3591
3592 assert(dn->first == follows+1);
3593 in->first = dn->first;
3594
3595 // prepare finisher
3596 mdr->ls = mdlog->get_current_segment();
3597 EUpdate *le = new EUpdate(mdlog, "openc");
3598 mdlog->start_entry(le);
3599 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
3600 journal_allocated_inos(mdr, &le->metablob);
3601 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
3602 le->metablob.add_primary_dentry(dn, in, true, true, true);
3603
3604 // do the open
3605 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
3606 in->authlock.set_state(LOCK_EXCL);
3607 in->xattrlock.set_state(LOCK_EXCL);
3608
3609 // make sure this inode gets into the journal
3610 le->metablob.add_opened_ino(in->ino());
3611 LogSegment *ls = mds->mdlog->get_current_segment();
3612 ls->open_files.push_back(&in->item_open_file);
3613
3614 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
3615
3616 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
3617 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
3618 // encode the new ino into the reply to indicate the inode was created, since the client supports the REPLY_CREATE_INODE feature
3619 ::encode(in->inode.ino, mdr->reply_extra_bl);
3620 }
3621
3622 journal_and_reply(mdr, in, dn, le, fin);
3623
3624 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3625 // have overshot the split size (multiple opencs in flight), so here is
3626 // an early chance to split the dir if this openc makes it oversized.
3627 mds->balancer->maybe_fragment(dir, false);
3628 }
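// Illustrative example (assuming a standard POSIX client): a call like
//
//   fd = open("/mnt/cephfs/dir/newfile", O_CREAT|O_EXCL|O_WRONLY, 0644);
//
// lands here with a null dentry xlocked. A fresh inode is projected,
// caps are issued with authlock/xattrlock already in LOCK_EXCL so the
// creating client can chmod/setxattr without extra round trips, and the
// reply (optionally carrying the created ino) is sent via
// journal_and_reply() once the "openc" EUpdate is journaled.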
3629
3630
3631
3632 void Server::handle_client_readdir(MDRequestRef& mdr)
3633 {
3634 MClientRequest *req = mdr->client_request;
3635 client_t client = req->get_source().num();
3636 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3637 CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
3638 if (!diri) return;
3639
3640 // it's a directory, right?
3641 if (!diri->is_dir()) {
3642 // not a dir
3643 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
3644 respond_to_request(mdr, -ENOTDIR);
3645 return;
3646 }
3647
3648 rdlocks.insert(&diri->filelock);
3649 rdlocks.insert(&diri->dirfragtreelock);
3650
3651 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3652 return;
3653
3654 if (!check_access(mdr, diri, MAY_READ))
3655 return;
3656
3657 // which frag?
3658 frag_t fg = (__u32)req->head.args.readdir.frag;
3659 unsigned req_flags = (__u32)req->head.args.readdir.flags;
3660 string offset_str = req->get_path2();
3661
3662 __u32 offset_hash = 0;
3663 if (!offset_str.empty())
3664 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
3665 else
3666 offset_hash = (__u32)req->head.args.readdir.offset_hash;
3667
3668 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
3669 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
3670
3671 // does the frag exist?
3672 if (diri->dirfragtree[fg.value()] != fg) {
3673 frag_t newfg;
3674 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3675 if (fg.contains((unsigned)offset_hash)) {
3676 newfg = diri->dirfragtree[offset_hash];
3677 } else {
3678 // client actually wants next frag
3679 newfg = diri->dirfragtree[fg.value()];
3680 }
3681 } else {
3682 offset_str.clear();
3683 newfg = diri->dirfragtree[fg.value()];
3684 }
3685 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
3686 fg = newfg;
3687 }
3688
3689 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
3690 if (!dir) return;
3691
3692 // ok!
3693 dout(10) << "handle_client_readdir on " << *dir << dendl;
3694 assert(dir->is_auth());
3695
3696 if (!dir->is_complete()) {
3697 if (dir->is_frozen()) {
3698 dout(7) << "dir is frozen " << *dir << dendl;
3699 mds->locker->drop_locks(mdr.get());
3700 mdr->drop_local_auth_pins();
3701 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3702 return;
3703 }
3704 // fetch
3705 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
3706 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3707 return;
3708 }
3709
3710 #ifdef MDS_VERIFY_FRAGSTAT
3711 dir->verify_fragstat();
3712 #endif
3713
3714 utime_t now = ceph_clock_now();
3715 mdr->set_mds_stamp(now);
3716
3717 snapid_t snapid = mdr->snapid;
3718 dout(10) << "snapid " << snapid << dendl;
3719
3720 SnapRealm *realm = diri->find_snaprealm();
3721
3722 unsigned max = req->head.args.readdir.max_entries;
3723 if (!max)
3724 max = dir->get_num_any(); // whatever, something big.
3725 unsigned max_bytes = req->head.args.readdir.max_bytes;
3726 if (!max_bytes)
3727 // make sure at least one item can be encoded
3728 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
3729
3730 // start final blob
3731 bufferlist dirbl;
3732 dir->encode_dirstat(dirbl, mds->get_nodeid());
3733
3734 // count bytes available.
3735 // this isn't perfect, but we should capture the main variable/unbounded size items!
3736 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
3737 int bytes_left = max_bytes - front_bytes;
3738 bytes_left -= realm->get_snap_trace().length();
3739
3740 // build dir contents
3741 bufferlist dnbl;
3742 __u32 numfiles = 0;
3743 bool start = !offset_hash && offset_str.empty();
3744 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3745 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
3746 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
3747 bool end = (it == dir->end());
3748 for (; !end && numfiles < max; end = (it == dir->end())) {
3749 CDentry *dn = it->second;
3750 ++it;
3751
3752 if (dn->state_test(CDentry::STATE_PURGING))
3753 continue;
3754
3755 bool dnp = dn->use_projected(client, mdr);
3756 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
3757
3758 if (dnl->is_null())
3759 continue;
3760
3761 if (dn->last < snapid || dn->first > snapid) {
3762 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
3763 continue;
3764 }
3765
3766 if (!start) {
3767 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
3768 if (!(offset_key < dn->key()))
3769 continue;
3770 }
3771
3772 CInode *in = dnl->get_inode();
3773
3774 if (in && in->ino() == CEPH_INO_CEPH)
3775 continue;
3776
3777 // remote link?
3778 // better for the MDS to do the work, if we think the client will stat any of these files.
3779 if (dnl->is_remote() && !in) {
3780 in = mdcache->get_inode(dnl->get_remote_ino());
3781 if (in) {
3782 dn->link_remote(dnl, in);
3783 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
3784 dout(10) << "skipping bad remote ino on " << *dn << dendl;
3785 continue;
3786 } else {
3787 // touch everything I _do_ have
3788 for (auto &p : *dir) {
3789 if (!p.second->get_linkage()->is_null())
3790 mdcache->lru.lru_touch(p.second);
3791 }
3792
3793 // already issued caps and leases, reply immediately.
3794 if (dnbl.length() > 0) {
3795 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
3796 dout(10) << " open remote dentry after caps were issued, stopping at "
3797 << dnbl.length() << " < " << bytes_left << dendl;
3798 break;
3799 }
3800
3801 mds->locker->drop_locks(mdr.get());
3802 mdr->drop_local_auth_pins();
3803 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
3804 return;
3805 }
3806 }
3807 assert(in);
3808
3809 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
3810 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
3811 break;
3812 }
3813
3814 unsigned start_len = dnbl.length();
3815
3816 // dentry
3817 dout(12) << "including dn " << *dn << dendl;
3818 ::encode(dn->get_name(), dnbl);
3819 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
3820
3821 // inode
3822 dout(12) << "including inode " << *in << dendl;
3823 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
3824 if (r < 0) {
3825 // chop off dn->name, lease
3826 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
3827 bufferlist keep;
3828 keep.substr_of(dnbl, 0, start_len);
3829 dnbl.swap(keep);
3830 break;
3831 }
3832 assert(r >= 0);
3833 numfiles++;
3834
3835 // touch dn
3836 mdcache->lru.lru_touch(dn);
3837 }
3838
3839 __u16 flags = 0;
3840 if (end) {
3841 flags = CEPH_READDIR_FRAG_END;
3842 if (start)
3843 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
3844 }
3845 // clients without CEPH_READDIR_REPLY_BITFLAGS only understand the END and COMPLETE flags
3846 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3847 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
3848 }
3849
3850 // finish final blob
3851 ::encode(numfiles, dirbl);
3852 ::encode(flags, dirbl);
3853 dirbl.claim_append(dnbl);
3854
3855 // yay, reply
3856 dout(10) << "reply to " << *req << " readdir num=" << numfiles
3857 << " bytes=" << dirbl.length()
3858 << " start=" << (int)start
3859 << " end=" << (int)end
3860 << dendl;
3861 mdr->reply_extra_bl = dirbl;
3862
3863 // bump popularity. NOTE: this doesn't quite capture it.
3864 mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);
3865
3866 // reply
3867 mdr->tracei = diri;
3868 respond_to_request(mdr, 0);
3869 }
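// Rough sketch of the reply layout assembled above (illustrative):
//
//   dirstat | numfiles | flags | numfiles x { dname, lease, inodestat }
//
// Encoding stops early once bytes_left would be exceeded, so large
// directories are listed through repeated readdir requests. Clients
// with CEPH_READDIR_REPLY_BITFLAGS resume at (frag, offset_hash);
// older clients resume from the last returned name passed in path2.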
3870
3871
3872
3873 // ===============================================================================
3874 // INODE UPDATES
3875
3876
3877 /*
3878 * finisher for basic inode updates
3879 */
3880 class C_MDS_inode_update_finish : public ServerLogContext {
3881 CInode *in;
3882 bool truncating_smaller, changed_ranges;
3883 public:
3884 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
3885 bool sm=false, bool cr=false) :
3886 ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
3887 void finish(int r) override {
3888 assert(r == 0);
3889
3890 // apply
3891 in->pop_and_dirty_projected_inode(mdr->ls);
3892 mdr->apply();
3893
3894 // notify any clients
3895 if (truncating_smaller && in->inode.is_truncating()) {
3896 get_mds()->locker->issue_truncate(in);
3897 get_mds()->mdcache->truncate_inode(in, mdr->ls);
3898 }
3899
3900 utime_t now = ceph_clock_now();
3901 get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
3902
3903 server->respond_to_request(mdr, 0);
3904
3905 if (changed_ranges)
3906 get_mds()->locker->share_inode_max_size(in);
3907 }
3908 };
3909
3910 void Server::handle_client_file_setlock(MDRequestRef& mdr)
3911 {
3912 MClientRequest *req = mdr->client_request;
3913 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3914
3915 // get the inode to operate on, and set up any locks needed for that
3916 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3917 if (!cur)
3918 return;
3919
3920 xlocks.insert(&cur->flocklock);
3921 /* acquire_locks will return true if it gets the locks. If it fails,
3922 it will redeliver this request at a later date, so drop the request.
3923 */
3924 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3925 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3926 return;
3927 }
3928
3929 // copy the lock change into a ceph_filelock so we can store/apply it
3930 ceph_filelock set_lock;
3931 set_lock.start = req->head.args.filelock_change.start;
3932 set_lock.length = req->head.args.filelock_change.length;
3933 set_lock.client = req->get_orig_source().num();
3934 set_lock.owner = req->head.args.filelock_change.owner;
3935 set_lock.pid = req->head.args.filelock_change.pid;
3936 set_lock.type = req->head.args.filelock_change.type;
3937 bool will_wait = req->head.args.filelock_change.wait;
3938
3939 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3940
3941 ceph_lock_state_t *lock_state = NULL;
3942 bool interrupt = false;
3943
3944 // get the appropriate lock state
3945 switch (req->head.args.filelock_change.rule) {
3946 case CEPH_LOCK_FLOCK_INTR:
3947 interrupt = true;
3948 // fall-thru
3949 case CEPH_LOCK_FLOCK:
3950 lock_state = cur->get_flock_lock_state();
3951 break;
3952
3953 case CEPH_LOCK_FCNTL_INTR:
3954 interrupt = true;
3955 // fall-thru
3956 case CEPH_LOCK_FCNTL:
3957 lock_state = cur->get_fcntl_lock_state();
3958 break;
3959
3960 default:
3961 dout(10) << "got unknown lock type " << set_lock.type
3962 << ", dropping request!" << dendl;
3963 respond_to_request(mdr, -EOPNOTSUPP);
3964 return;
3965 }
3966
3967 dout(10) << " state prior to lock change: " << *lock_state << dendl;
3968 if (CEPH_LOCK_UNLOCK == set_lock.type) {
3969 list<ceph_filelock> activated_locks;
3970 list<MDSInternalContextBase*> waiters;
3971 if (lock_state->is_waiting(set_lock)) {
3972 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
3973 lock_state->remove_waiting(set_lock);
3974 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3975 } else if (!interrupt) {
3976 dout(10) << " unlock attempt on " << set_lock << dendl;
3977 lock_state->remove_lock(set_lock, activated_locks);
3978 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3979 }
3980 mds->queue_waiters(waiters);
3981
3982 respond_to_request(mdr, 0);
3983 } else {
3984 dout(10) << " lock attempt on " << set_lock << dendl;
3985 bool deadlock = false;
3986 if (mdr->more()->flock_was_waiting &&
3987 !lock_state->is_waiting(set_lock)) {
3988 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
3989 respond_to_request(mdr, -EINTR);
3990 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
3991 dout(10) << " it failed on this attempt" << dendl;
3992 // couldn't set lock right now
3993 if (deadlock) {
3994 respond_to_request(mdr, -EDEADLK);
3995 } else if (!will_wait) {
3996 respond_to_request(mdr, -EWOULDBLOCK);
3997 } else {
3998 dout(10) << " added to waiting list" << dendl;
3999 assert(lock_state->is_waiting(set_lock));
4000 mdr->more()->flock_was_waiting = true;
4001 mds->locker->drop_locks(mdr.get());
4002 mdr->drop_local_auth_pins();
4003 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4004 }
4005 } else
4006 respond_to_request(mdr, 0);
4007 }
4008 dout(10) << " state after lock change: " << *lock_state << dendl;
4009 }
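// Illustrative mapping from the branches above to what a POSIX
// fcntl()/flock() caller would see (the client-side op names are
// assumptions, not part of this file):
//
//   granted immediately                  -> 0
//   conflict, non-blocking (F_SETLK)     -> -EWOULDBLOCK
//   conflict, blocking (F_SETLKW)        -> parked on CInode::WAIT_FLOCK
//   blocked wait canceled via *_INTR     -> -EINTR
//   deadlock detected among waiters      -> -EDEADLK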
4010
4011 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4012 {
4013 MClientRequest *req = mdr->client_request;
4014 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4015
4016 // get the inode to operate on, and set up any locks needed for that
4017 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4018 if (!cur)
4019 return;
4020
4021 /* acquire_locks will return true if it gets the locks. If it fails,
4022 it will redeliver this request at a later date, so drop the request.
4023 */
4024 rdlocks.insert(&cur->flocklock);
4025 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
4026 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4027 return;
4028 }
4029
4030 // copy the lock change into a ceph_filelock so we can store/apply it
4031 ceph_filelock checking_lock;
4032 checking_lock.start = req->head.args.filelock_change.start;
4033 checking_lock.length = req->head.args.filelock_change.length;
4034 checking_lock.client = req->get_orig_source().num();
4035 checking_lock.owner = req->head.args.filelock_change.owner;
4036 checking_lock.pid = req->head.args.filelock_change.pid;
4037 checking_lock.type = req->head.args.filelock_change.type;
4038
4039 // get the appropriate lock state
4040 ceph_lock_state_t *lock_state = NULL;
4041 switch (req->head.args.filelock_change.rule) {
4042 case CEPH_LOCK_FLOCK:
4043 lock_state = cur->get_flock_lock_state();
4044 break;
4045
4046 case CEPH_LOCK_FCNTL:
4047 lock_state = cur->get_fcntl_lock_state();
4048 break;
4049
4050 default:
4051 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4052 respond_to_request(mdr, -EINVAL);
4053 return;
4054 }
4055 lock_state->look_for_lock(checking_lock);
4056
4057 bufferlist lock_bl;
4058 ::encode(checking_lock, lock_bl);
4059
4060 mdr->reply_extra_bl = lock_bl;
4061 respond_to_request(mdr, 0);
4062 }
4063
4064 void Server::handle_client_setattr(MDRequestRef& mdr)
4065 {
4066 MClientRequest *req = mdr->client_request;
4067 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4068 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4069 if (!cur) return;
4070
4071 if (mdr->snapid != CEPH_NOSNAP) {
4072 respond_to_request(mdr, -EROFS);
4073 return;
4074 }
4075 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4076 respond_to_request(mdr, -EPERM);
4077 return;
4078 }
4079
4080 __u32 mask = req->head.args.setattr.mask;
4081 __u32 access_mask = MAY_WRITE;
4082
4083 // xlock inode
4084 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4085 xlocks.insert(&cur->authlock);
4086 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4087 xlocks.insert(&cur->filelock);
4088 if (mask & CEPH_SETATTR_CTIME)
4089 wrlocks.insert(&cur->versionlock);
4090
4091 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4092 return;
4093
4094 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4095 access_mask |= MAY_CHOWN;
4096
4097 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4098 access_mask |= MAY_CHGRP;
4099
4100 if (!check_access(mdr, cur, access_mask))
4101 return;
4102
4103 // trunc from bigger -> smaller?
4104 auto pip = cur->get_projected_inode();
4105
4106 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
4107
4108 // ENOSPC on growing file while full, but allow shrinks
4109 if (is_full && req->head.args.setattr.size > old_size) {
4110 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4111 respond_to_request(mdr, -ENOSPC);
4112 return;
4113 }
4114
4115 bool truncating_smaller = false;
4116 if (mask & CEPH_SETATTR_SIZE) {
4117 truncating_smaller = req->head.args.setattr.size < old_size;
4118 if (truncating_smaller && pip->is_truncating()) {
4119 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4120 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4121 mds->locker->drop_locks(mdr.get());
4122 mdr->drop_local_auth_pins();
4123 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4124 return;
4125 }
4126 }
4127
4128 bool changed_ranges = false;
4129
4130 // project update
4131 mdr->ls = mdlog->get_current_segment();
4132 EUpdate *le = new EUpdate(mdlog, "setattr");
4133 mdlog->start_entry(le);
4134
4135 auto &pi = cur->project_inode();
4136
4137 if (mask & CEPH_SETATTR_UID)
4138 pi.inode.uid = req->head.args.setattr.uid;
4139 if (mask & CEPH_SETATTR_GID)
4140 pi.inode.gid = req->head.args.setattr.gid;
4141
4142 if (mask & CEPH_SETATTR_MODE)
4143 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4144 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4145 S_ISREG(pi.inode.mode) &&
4146 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4147 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4148 }
4149
4150 if (mask & CEPH_SETATTR_MTIME)
4151 pi.inode.mtime = req->head.args.setattr.mtime;
4152 if (mask & CEPH_SETATTR_ATIME)
4153 pi.inode.atime = req->head.args.setattr.atime;
4154 if (mask & CEPH_SETATTR_BTIME)
4155 pi.inode.btime = req->head.args.setattr.btime;
4156 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4157 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4158 if (mask & CEPH_SETATTR_SIZE) {
4159 if (truncating_smaller) {
4160 pi.inode.truncate(old_size, req->head.args.setattr.size);
4161 le->metablob.add_truncate_start(cur->ino());
4162 } else {
4163 pi.inode.size = req->head.args.setattr.size;
4164 pi.inode.rstat.rbytes = pi.inode.size;
4165 }
4166 pi.inode.mtime = mdr->get_op_stamp();
4167
4168 // adjust client's max_size?
4169 CInode::mempool_inode::client_range_map new_ranges;
4170 bool max_increased = false;
4171 mds->locker->calc_new_client_ranges(cur, pi.inode.size, &new_ranges, &max_increased);
4172 if (pi.inode.client_ranges != new_ranges) {
4173 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
4174 pi.inode.client_ranges = new_ranges;
4175 changed_ranges = true;
4176 }
4177 }
4178
4179 pi.inode.version = cur->pre_dirty();
4180 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
4181 pi.inode.change_attr++;
4182
4183 // log + wait
4184 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4185 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4186 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4187
4188 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4189 truncating_smaller, changed_ranges));
4190
4191 // flush immediately if there are readers/writers waiting
4192 if (xlocks.count(&cur->filelock) &&
4193 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4194 mds->mdlog->flush();
4195 }
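// Note on the shrinking-truncate flow above (summary, no new behavior):
// a size reduction is two-phase. This request journals
// add_truncate_start() and replies; the actual object trimming is kicked
// off afterwards by C_MDS_inode_update_finish via truncate_inode(). A
// second shrink arriving meanwhile sees is_truncating() on the projected
// inode and parks on CInode::WAIT_TRUNC until the first completes.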
4196
4197 /* Takes responsibility for mdr */
4198 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4199 {
4200 CInode *in = mdr->in[0];
4201 client_t client = mdr->get_client();
4202 assert(in);
4203
4204 dout(10) << "do_open_truncate " << *in << dendl;
4205
4206 SnapRealm *realm = in->find_snaprealm();
4207 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4208
4209 mdr->ls = mdlog->get_current_segment();
4210 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4211 mdlog->start_entry(le);
4212
4213 // prepare
4214 auto &pi = in->project_inode();
4215 pi.inode.version = in->pre_dirty();
4216 pi.inode.mtime = pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
4217 pi.inode.change_attr++;
4218
4219 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
4220 if (old_size > 0) {
4221 pi.inode.truncate(old_size, 0);
4222 le->metablob.add_truncate_start(in->ino());
4223 }
4224
4225 bool changed_ranges = false;
4226 if (cmode & CEPH_FILE_MODE_WR) {
4227 pi.inode.client_ranges[client].range.first = 0;
4228 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
4229 pi.inode.client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
4230 changed_ranges = true;
4231 }
4232
4233 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4234
4235 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4236 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4237
4238 // make sure ino gets into the journal
4239 le->metablob.add_opened_ino(in->ino());
4240 LogSegment *ls = mds->mdlog->get_current_segment();
4241 ls->open_files.push_back(&in->item_open_file);
4242
4243 mdr->o_trunc = true;
4244
4245 CDentry *dn = 0;
4246 if (mdr->client_request->get_dentry_wanted()) {
4247 assert(mdr->dn[0].size());
4248 dn = mdr->dn[0].back();
4249 }
4250
4251 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4252 changed_ranges));
4253 // Although the `open` part can give an early reply, the truncation won't
4254 // happen until our EUpdate is persistent; to give the client a prompt
4255 // response we must also flush that event.
4256 mdlog->flush();
4257 }
4258
4259
4260 /* This function cleans up the passed mdr */
4261 void Server::handle_client_setlayout(MDRequestRef& mdr)
4262 {
4263 MClientRequest *req = mdr->client_request;
4264 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4265 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4266 if (!cur) return;
4267
4268 if (mdr->snapid != CEPH_NOSNAP) {
4269 respond_to_request(mdr, -EROFS);
4270 return;
4271 }
4272 if (!cur->is_file()) {
4273 respond_to_request(mdr, -EINVAL);
4274 return;
4275 }
4276 if (cur->get_projected_inode()->size ||
4277 cur->get_projected_inode()->truncate_seq > 1) {
4278 respond_to_request(mdr, -ENOTEMPTY);
4279 return;
4280 }
4281
4282 // validate layout
4283 file_layout_t layout = cur->get_projected_inode()->layout;
4284 // save existing layout for later
4285 const auto old_layout = layout;
4286
4287 int access = MAY_WRITE;
4288
4289 if (req->head.args.setlayout.layout.fl_object_size > 0)
4290 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4291 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4292 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4293 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4294 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4295 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4296 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4297
4298 // make sure we have as new a map as the client
4299 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4300 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4301 return;
4302 }
4303 }
4304
4305 // Don't permit layout modifications without 'p' caps
4306 if (layout != old_layout) {
4307 access |= MAY_SET_VXATTR;
4308 }
4309
4310 if (!layout.is_valid()) {
4311 dout(10) << "bad layout" << dendl;
4312 respond_to_request(mdr, -EINVAL);
4313 return;
4314 }
4315 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4316 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4317 respond_to_request(mdr, -EINVAL);
4318 return;
4319 }
4320
4321 xlocks.insert(&cur->filelock);
4322 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4323 return;
4324
4325 if (!check_access(mdr, cur, access))
4326 return;
4327
4328 // project update
4329 auto &pi = cur->project_inode();
4330 pi.inode.layout = layout;
4331 // add the old pool to the inode
4332 pi.inode.add_old_pool(old_layout.pool_id);
4333 pi.inode.version = cur->pre_dirty();
4334 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
4335 pi.inode.change_attr++;
4336
4337 // log + wait
4338 mdr->ls = mdlog->get_current_segment();
4339 EUpdate *le = new EUpdate(mdlog, "setlayout");
4340 mdlog->start_entry(le);
4341 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4342 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4343 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4344
4345 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4346 }
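// Illustrative note (assuming the usual kernel/libcephfs callers): this
// op is reached via the SET_LAYOUT ioctl on an already-open file, and it
// is only legal while the file is still empty (size == 0 and never
// truncated), since existing objects would not match the new striping;
// hence the -ENOTEMPTY check before any locks are taken.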
4347
4348 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4349 {
4350 MClientRequest *req = mdr->client_request;
4351 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4352 file_layout_t *dir_layout = NULL;
4353 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4354 if (!cur) return;
4355
4356 if (mdr->snapid != CEPH_NOSNAP) {
4357 respond_to_request(mdr, -EROFS);
4358 return;
4359 }
4360
4361 if (!cur->is_dir()) {
4362 respond_to_request(mdr, -ENOTDIR);
4363 return;
4364 }
4365
4366 xlocks.insert(&cur->policylock);
4367 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4368 return;
4369
4370 // validate layout
4371 const auto old_pi = cur->get_projected_inode();
4372 file_layout_t layout;
4373 if (old_pi->has_layout())
4374 layout = old_pi->layout;
4375 else if (dir_layout)
4376 layout = *dir_layout;
4377 else
4378 layout = mdcache->default_file_layout;
4379
4380 // Level of access required to complete
4381 int access = MAY_WRITE;
4382
4383 const auto old_layout = layout;
4384
4385 if (req->head.args.setlayout.layout.fl_object_size > 0)
4386 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4387 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4388 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4389 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4390 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4391 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4392 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4393 // make sure we have as new a map as the client
4394 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4395 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4396 return;
4397 }
4398 }
4399
4400 if (layout != old_layout) {
4401 access |= MAY_SET_VXATTR;
4402 }
4403
4404 if (!layout.is_valid()) {
4405 dout(10) << "bad layout" << dendl;
4406 respond_to_request(mdr, -EINVAL);
4407 return;
4408 }
4409 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4410 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4411 respond_to_request(mdr, -EINVAL);
4412 return;
4413 }
4414
4415 if (!check_access(mdr, cur, access))
4416 return;
4417
4418 auto &pi = cur->project_inode();
4419 pi.inode.layout = layout;
4420 pi.inode.version = cur->pre_dirty();
4421
4422 // log + wait
4423 mdr->ls = mdlog->get_current_segment();
4424 EUpdate *le = new EUpdate(mdlog, "setlayout");
4425 mdlog->start_entry(le);
4426 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4427 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4428 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4429
4430 mdr->no_early_reply = true;
4431 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4432 }
4433
4434 // XATTRS
4435
4436 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4437 file_layout_t *layout, bool validate)
4438 {
4439 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4440 try {
4441 if (name == "layout") {
4442 string::iterator begin = value.begin();
4443 string::iterator end = value.end();
4444 keys_and_values<string::iterator> p; // create instance of parser
4445 std::map<string, string> m; // map to receive results
4446 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4447 return -EINVAL;
4448 }
4449 string left(begin, end);
4450 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4451 if (begin != end)
4452 return -EINVAL;
4453 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4454 // Skip validation on each attr, we do it once at the end (avoid
4455 // rejecting intermediate states if the overall result is ok)
4456 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4457 osdmap, layout, false);
4458 if (r < 0)
4459 return r;
4460 }
4461 } else if (name == "layout.object_size") {
4462 layout->object_size = boost::lexical_cast<unsigned>(value);
4463 } else if (name == "layout.stripe_unit") {
4464 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4465 } else if (name == "layout.stripe_count") {
4466 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4467 } else if (name == "layout.pool") {
4468 try {
4469 layout->pool_id = boost::lexical_cast<unsigned>(value);
4470 } catch (boost::bad_lexical_cast const&) {
4471 int64_t pool = osdmap.lookup_pg_pool_name(value);
4472 if (pool < 0) {
4473 dout(10) << " unknown pool " << value << dendl;
4474 return -ENOENT;
4475 }
4476 layout->pool_id = pool;
4477 }
4478 } else if (name == "layout.pool_namespace") {
4479 layout->pool_ns = value;
4480 } else {
4481 dout(10) << " unknown layout vxattr " << name << dendl;
4482 return -EINVAL;
4483 }
4484 } catch (boost::bad_lexical_cast const&) {
4485 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4486 return -EINVAL;
4487 }
4488
4489 if (validate && !layout->is_valid()) {
4490 dout(10) << "bad layout" << dendl;
4491 return -EINVAL;
4492 }
4493 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4494 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4495 return -EINVAL;
4496 }
4497 return 0;
4498 }
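// Example (illustrative): the combined "layout" form parses
// space-separated key=value pairs, so a payload such as
//
//   "stripe_unit=1048576 stripe_count=2 object_size=4194304 pool=cephfs_data"
//
// recurses once per pair with validate=false and validates the assembled
// layout once at the end; "layout.pool" accepts either a numeric pool id
// or a pool name, the latter resolved against the supplied OSDMap.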
4499
4500 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4501 {
4502 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4503 try {
4504 if (name == "quota") {
4505 string::iterator begin = value.begin();
4506 string::iterator end = value.end();
4507 keys_and_values<string::iterator> p; // create instance of parser
4508 std::map<string, string> m; // map to receive results
4509 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4510 return -EINVAL;
4511 }
4512 string left(begin, end);
4513 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4514 if (begin != end)
4515 return -EINVAL;
4516 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4517 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4518 if (r < 0)
4519 return r;
4520 }
4521 } else if (name == "quota.max_bytes") {
4522 int64_t q = boost::lexical_cast<int64_t>(value);
4523 if (q < 0)
4524 return -EINVAL;
4525 quota->max_bytes = q;
4526 } else if (name == "quota.max_files") {
4527 int64_t q = boost::lexical_cast<int64_t>(value);
4528 if (q < 0)
4529 return -EINVAL;
4530 quota->max_files = q;
4531 } else {
4532 dout(10) << " unknown quota vxattr " << name << dendl;
4533 return -EINVAL;
4534 }
4535 } catch (boost::bad_lexical_cast const&) {
4536 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4537 return -EINVAL;
4538 }
4539
4540 if (!quota->is_valid()) {
4541 dout(10) << "bad quota" << dendl;
4542 return -EINVAL;
4543 }
4544 return 0;
4545 }
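// Example (illustrative): quotas arrive as virtual xattrs, e.g.
//
//   setfattr -n ceph.quota.max_bytes -v 10737418240 /mnt/cephfs/dir
//   setfattr -n ceph.quota.max_files -v 100000      /mnt/cephfs/dir
//
// Zero is accepted and conventionally means "no limit"; only negative
// values are rejected with -EINVAL above.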
4546
4547 /*
4548 * Verify that the file layout attribute carried by the client
4549 * is well-formatted.
4550 * Return 0 on success; otherwise this function takes
4551 * responsibility for the passed mdr.
4552 */
4553 int Server::check_layout_vxattr(MDRequestRef& mdr,
4554 string name,
4555 string value,
4556 file_layout_t *layout)
4557 {
4558 MClientRequest *req = mdr->client_request;
4559 epoch_t epoch;
4560 int r;
4561
4562 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4563 r = parse_layout_vxattr(name, value, osdmap, layout);
4564 epoch = osdmap.get_epoch();
4565 });
4566
4567 if (r == -ENOENT) {
4568
4569 // we don't have the specified pool, make sure our map
4570 // is newer than or as new as the client.
4571 epoch_t req_epoch = req->get_osdmap_epoch();
4572
4573 if (req_epoch > epoch) {
4574
4575 // well, our map is older. consult mds.
4576 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4577
4578 if (!mds->objecter->wait_for_map(req_epoch, fin))
4579 return r; // wait, fin will retry this request later
4580
4581 delete fin;
4582
4583 // now we have at least as new a map as the client, try again.
4584 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4585 r = parse_layout_vxattr(name, value, osdmap, layout);
4586 epoch = osdmap.get_epoch();
4587 });
4588
4589 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4590
4591 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4592
4593 // For compatibility with clients running old code, we still need to get
4594 // the latest map. One day, once the COMPACT_VERSION of MClientRequest is >= 3,
4595 // this code can be removed.
4596 mdr->waited_for_osdmap = true;
4597 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4598 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4599 return r;
4600 }
4601 }
4602
4603 if (r < 0) {
4604
4605 if (r == -ENOENT)
4606 r = -EINVAL;
4607
4608 respond_to_request(mdr, r);
4609 return r;
4610 }
4611
4612 // all is well
4613 return 0;
4614 }
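// Timeline sketch of the OSDMap handshake above (illustrative): when the
// client names a pool this MDS's map doesn't know and the client's map
// is newer, the request parks until the objecter has caught up, then the
// parse is retried; only once our map is at least as new as the client's
// is -ENOENT demoted to a definitive -EINVAL.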
4615
4616 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
4617 file_layout_t *dir_layout,
4618 set<SimpleLock*> rdlocks,
4619 set<SimpleLock*> wrlocks,
4620 set<SimpleLock*> xlocks)
4621 {
4622 MClientRequest *req = mdr->client_request;
4623 string name(req->get_path2());
4624 bufferlist bl = req->get_data();
4625 string value (bl.c_str(), bl.length());
4626 dout(10) << "handle_set_vxattr " << name
4627 << " val " << value.length()
4628 << " bytes on " << *cur
4629 << dendl;
4630
4631 CInode::mempool_inode *pip = nullptr;
4632 string rest;
4633
4634 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
4635 return;
4636 }
4637
4638 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
4639 if (!cur->is_dir()) {
4640 respond_to_request(mdr, -EINVAL);
4641 return;
4642 }
4643
4644 file_layout_t layout;
4645 if (cur->get_projected_inode()->has_layout())
4646 layout = cur->get_projected_inode()->layout;
4647 else if (dir_layout)
4648 layout = *dir_layout;
4649 else
4650 layout = mdcache->default_file_layout;
4651
4652 rest = name.substr(name.find("layout"));
4653 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4654 return;
4655
4656 xlocks.insert(&cur->policylock);
4657 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4658 return;
4659
4660 auto &pi = cur->project_inode();
4661 pi.inode.layout = layout;
4662 mdr->no_early_reply = true;
4663 pip = &pi.inode;
4664 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
4665 if (!cur->is_file()) {
4666 respond_to_request(mdr, -EINVAL);
4667 return;
4668 }
4669 if (cur->get_projected_inode()->size ||
4670 cur->get_projected_inode()->truncate_seq > 1) {
4671 respond_to_request(mdr, -ENOTEMPTY);
4672 return;
4673 }
4674 file_layout_t layout = cur->get_projected_inode()->layout;
4675 rest = name.substr(name.find("layout"));
4676 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4677 return;
4678
4679 xlocks.insert(&cur->filelock);
4680 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4681 return;
4682
4683 auto &pi = cur->project_inode();
4684 int64_t old_pool = pi.inode.layout.pool_id;
4685 pi.inode.add_old_pool(old_pool);
4686 pi.inode.layout = layout;
4687 pip = &pi.inode;
4688 } else if (name.compare(0, 10, "ceph.quota") == 0) {
4689 if (!cur->is_dir() || cur->is_root()) {
4690 respond_to_request(mdr, -EINVAL);
4691 return;
4692 }
4693
4694 quota_info_t quota = cur->get_projected_inode()->quota;
4695
4696 rest = name.substr(name.find("quota"));
4697 int r = parse_quota_vxattr(rest, value, &quota);
4698 if (r < 0) {
4699 respond_to_request(mdr, r);
4700 return;
4701 }
4702
4703 xlocks.insert(&cur->policylock);
4704 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4705 return;
4706
4707 auto &pi = cur->project_inode();
4708 pi.inode.quota = quota;
4709
4710 mdr->no_early_reply = true;
4711 pip = &pi.inode;
4712
4713 client_t exclude_ct = mdr->get_client();
4714 mdcache->broadcast_quota_to_client(cur, exclude_ct);
4715 } else if (name.find("ceph.dir.pin") == 0) {
4716 if (!cur->is_dir() || cur->is_root()) {
4717 respond_to_request(mdr, -EINVAL);
4718 return;
4719 }
4720
4721 mds_rank_t rank;
4722 try {
4723 rank = boost::lexical_cast<mds_rank_t>(value);
4724 if (rank < 0) rank = MDS_RANK_NONE;
4725 } catch (boost::bad_lexical_cast const&) {
4726 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4727 respond_to_request(mdr, -EINVAL);
4728 return;
4729 }
4730
4731 xlocks.insert(&cur->policylock);
4732 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4733 return;
4734
4735 auto &pi = cur->project_inode();
4736 cur->set_export_pin(rank);
4737 pip = &pi.inode;
4738 } else {
4739 dout(10) << " unknown vxattr " << name << dendl;
4740 respond_to_request(mdr, -EINVAL);
4741 return;
4742 }
4743
4744 pip->change_attr++;
4745 pip->ctime = pip->rstat.rctime = mdr->get_op_stamp();
4746 pip->version = cur->pre_dirty();
4747 if (cur->is_file())
4748 pip->update_backtrace();
4749
4750 // log + wait
4751 mdr->ls = mdlog->get_current_segment();
4752 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
4753 mdlog->start_entry(le);
4754 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4755 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4756 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4757
4758 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4759 return;
4760 }
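// Client-side view of the vxattr families handled above (illustrative
// examples; paths and values are placeholders):
//
//   setfattr -n ceph.dir.layout.pool         -v cephfs_data <dir>
//   setfattr -n ceph.file.layout.stripe_unit -v 1048576     <empty file>
//   setfattr -n ceph.quota.max_bytes         -v 10737418240 <dir>
//   setfattr -n ceph.dir.pin                 -v 1           <dir>
//
// Each variant takes policylock or filelock as appropriate and journals
// a single EUpdate; the directory-wide settings set no_early_reply so a
// successful reply is never sent before the new policy is persistent.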
4761
4762 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
4763 file_layout_t *dir_layout,
4764 set<SimpleLock*> rdlocks,
4765 set<SimpleLock*> wrlocks,
4766 set<SimpleLock*> xlocks)
4767 {
4768 MClientRequest *req = mdr->client_request;
4769 string name(req->get_path2());
4770
4771 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
4772
4773 if (name == "ceph.dir.layout") {
4774 if (!cur->is_dir()) {
4775 respond_to_request(mdr, -ENODATA);
4776 return;
4777 }
4778 if (cur->is_root()) {
4779 dout(10) << "can't remove layout policy on the root directory" << dendl;
4780 respond_to_request(mdr, -EINVAL);
4781 return;
4782 }
4783
4784 if (!cur->get_projected_inode()->has_layout()) {
4785 respond_to_request(mdr, -ENODATA);
4786 return;
4787 }
4788
4789 xlocks.insert(&cur->policylock);
4790 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4791 return;
4792
4793 auto &pi = cur->project_inode();
4794 pi.inode.clear_layout();
4795 pi.inode.version = cur->pre_dirty();
4796
4797 // log + wait
4798 mdr->ls = mdlog->get_current_segment();
4799 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
4800 mdlog->start_entry(le);
4801 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4802 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4803 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4804
4805 mdr->no_early_reply = true;
4806 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4807 return;
4808 } else if (name == "ceph.dir.layout.pool_namespace"
4809 || name == "ceph.file.layout.pool_namespace") {
4810 // Namespace is the only layout field that has a meaningful
4811 // null/none value (the empty string, meaning the default layout).
4812 // Removing it is equivalent to a setxattr with an empty value, so
4813 // pass the empty payload of the rmxattr request through to do this.
4814 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4815 return;
4816 }
4817
4818 respond_to_request(mdr, -ENODATA);
4819 }
4820
4821 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4822 CInode *in;
4823 public:
4824
4825 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4826 ServerLogContext(s, r), in(i) { }
4827 void finish(int r) override {
4828 assert(r == 0);
4829
4830 // apply
4831 in->pop_and_dirty_projected_inode(mdr->ls);
4832
4833 mdr->apply();
4834
4835 utime_t now = ceph_clock_now();
4836 get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
4837
4838 server->respond_to_request(mdr, 0);
4839 }
4840 };
4841
4842 void Server::handle_client_setxattr(MDRequestRef& mdr)
4843 {
4844 MClientRequest *req = mdr->client_request;
4845 string name(req->get_path2());
4846 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4847 CInode *cur;
4848
4849 file_layout_t *dir_layout = NULL;
4850 if (name.compare(0, 15, "ceph.dir.layout") == 0)
4851 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4852 else
4853 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4854 if (!cur)
4855 return;
4856
4857 if (mdr->snapid != CEPH_NOSNAP) {
4858 respond_to_request(mdr, -EROFS);
4859 return;
4860 }
4861
4862 int flags = req->head.args.setxattr.flags;
4863
4864 // magic ceph.* namespace?
4865 if (name.compare(0, 5, "ceph.") == 0) {
4866 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4867 return;
4868 }
4869
4870 xlocks.insert(&cur->xattrlock);
4871 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4872 return;
4873
4874 if (!check_access(mdr, cur, MAY_WRITE))
4875 return;
4876
4877 auto pxattrs = cur->get_projected_xattrs();
4878 size_t len = req->get_data().length();
4879 size_t inc = len + name.length();
4880
4881 // check xattrs kv pairs size
4882 size_t cur_xattrs_size = 0;
4883 for (const auto& p : *pxattrs) {
4884 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(std::string(boost::string_view(p.first))) == 0)) {
4885 continue;
4886 }
4887 cur_xattrs_size += p.first.length() + p.second.length();
4888 }
4889
4890 if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
4891 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4892 << cur_xattrs_size << ", inc " << inc << dendl;
4893 respond_to_request(mdr, -ENOSPC);
4894 return;
4895 }
4896
4897 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
4898 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
4899 respond_to_request(mdr, -EEXIST);
4900 return;
4901 }
4902 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
4903 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
4904 respond_to_request(mdr, -ENODATA);
4905 return;
4906 }
4907
4908 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
4909
4910 // project update
4911 auto &pi = cur->project_inode(true);
4912 pi.inode.version = cur->pre_dirty();
4913 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
4914 pi.inode.change_attr++;
4915 pi.inode.xattr_version++;
4916 auto &px = *pi.xattrs;
4917 if ((flags & CEPH_XATTR_REMOVE)) {
4918 px.erase(mempool::mds_co::string(boost::string_view(name)));
4919 } else {
4920 bufferptr b = buffer::create(len);
4921 if (len)
4922 req->get_data().copy(0, len, b.c_str());
4923 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(boost::string_view(name))), std::forward_as_tuple(b));
4924 if (!em.second)
4925 em.first->second = b;
4926 }
4927
4928 // log + wait
4929 mdr->ls = mdlog->get_current_segment();
4930 EUpdate *le = new EUpdate(mdlog, "setxattr");
4931 mdlog->start_entry(le);
4932 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4933 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4934 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4935
4936 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4937 }
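// Sketch of the size accounting above (illustrative): a new pair costs
// name.length() + value length against the mds_max_xattr_pairs_size
// budget, and with CEPH_XATTR_REPLACE the pair being replaced is
// excluded from cur_xattrs_size, so updating a large value in place does
// not spuriously fail with -ENOSPC.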
4938
4939 void Server::handle_client_removexattr(MDRequestRef& mdr)
4940 {
4941 MClientRequest *req = mdr->client_request;
4942 std::string name(req->get_path2());
4943 std::set<SimpleLock*> rdlocks, wrlocks, xlocks;
4944 file_layout_t *dir_layout = NULL;
4945 CInode *cur;
4946 if (name == "ceph.dir.layout")
4947 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4948 else
4949 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4950 if (!cur)
4951 return;
4952
4953 if (mdr->snapid != CEPH_NOSNAP) {
4954 respond_to_request(mdr, -EROFS);
4955 return;
4956 }
4957
4958 if (name.compare(0, 5, "ceph.") == 0) {
4959 handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4960 return;
4961 }
4962
4963 xlocks.insert(&cur->xattrlock);
4964 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4965 return;
4966
4967 auto pxattrs = cur->get_projected_xattrs();
4968 if (pxattrs->count(mempool::mds_co::string(boost::string_view(name))) == 0) {
4969 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
4970 respond_to_request(mdr, -ENODATA);
4971 return;
4972 }
4973
4974 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
4975
4976 // project update
4977 auto &pi = cur->project_inode(true);
4978 auto &px = *pi.xattrs;
4979 pi.inode.version = cur->pre_dirty();
4980 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
4981 pi.inode.change_attr++;
4982 pi.inode.xattr_version++;
4983 px.erase(mempool::mds_co::string(boost::string_view(name)));
4984
4985 // log + wait
4986 mdr->ls = mdlog->get_current_segment();
4987 EUpdate *le = new EUpdate(mdlog, "removexattr");
4988 mdlog->start_entry(le);
4989 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4990 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4991 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4992
4993 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4994 }
4995
4996
4997 // =================================================================
4998 // DIRECTORY and NAMESPACE OPS
4999
5000
5001 // ------------------------------------------------
5002
5003 // MKNOD
5004
5005 class C_MDS_mknod_finish : public ServerLogContext {
5006 CDentry *dn;
5007 CInode *newi;
5008 public:
5009 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5010 ServerLogContext(s, r), dn(d), newi(ni) {}
5011 void finish(int r) override {
5012 assert(r == 0);
5013
5014 // link the inode
5015 dn->pop_projected_linkage();
5016
5017 // be a bit hacky with the inode version here... we decrement it
5018 // just to keep mark_dirty() happy. (we didn't bother projecting
5019 // a new version of the inode since it's just been created)
5020 newi->inode.version--;
5021 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5022 newi->mark_dirty_parent(mdr->ls, true);
5023
5024 // mkdir?
5025 if (newi->inode.is_dir()) {
5026 CDir *dir = newi->get_dirfrag(frag_t());
5027 assert(dir);
5028 dir->fnode.version--;
5029 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5030 dir->mark_new(mdr->ls);
5031 }
5032
5033 mdr->apply();
5034
5035 MDRequestRef null_ref;
5036 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5037
5038 if (newi->inode.is_file())
5039 get_mds()->locker->share_inode_max_size(newi);
5040
5041 // hit pop
5042 utime_t now = ceph_clock_now();
5043 get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
5044
5045 // reply
5046 server->respond_to_request(mdr, 0);
5047 }
5048 };
5049
5050
5051 void Server::handle_client_mknod(MDRequestRef& mdr)
5052 {
5053 MClientRequest *req = mdr->client_request;
5054 client_t client = mdr->get_client();
5055 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5056 file_layout_t *dir_layout = NULL;
5057 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
5058 &dir_layout);
5059 if (!dn) return;
5060 if (mdr->snapid != CEPH_NOSNAP) {
5061 respond_to_request(mdr, -EROFS);
5062 return;
5063 }
5064 CInode *diri = dn->get_dir()->get_inode();
5065 rdlocks.insert(&diri->authlock);
5066 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5067 return;
5068
5069 if (!check_access(mdr, diri, MAY_WRITE))
5070 return;
5071
5072 if (!check_fragment_space(mdr, dn->get_dir()))
5073 return;
5074
5075 unsigned mode = req->head.args.mknod.mode;
5076 if ((mode & S_IFMT) == 0)
5077 mode |= S_IFREG;
5078
5079 // set layout
5080 file_layout_t layout;
5081 if (dir_layout && S_ISREG(mode))
5082 layout = *dir_layout;
5083 else
5084 layout = mdcache->default_file_layout;
5085
5086 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5087 snapid_t follows = realm->get_newest_seq();
5088 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
5089 mode, &layout);
5090 assert(newi);
5091
5092 dn->push_projected_linkage(newi);
5093
5094 newi->inode.rdev = req->head.args.mknod.rdev;
5095 newi->inode.version = dn->pre_dirty();
5096 newi->inode.rstat.rfiles = 1;
5097 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5098 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5099 newi->inode.update_backtrace();
5100
5101 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5102 // want to write to it (e.g., if they are reexporting NFS)
5103 if (S_ISREG(newi->inode.mode)) {
5104 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5105 newi->inode.client_ranges[client].range.first = 0;
5106 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5107 newi->inode.client_ranges[client].follows = follows;
5108
5109 // issue a cap on the file
5110 int cmode = CEPH_FILE_MODE_RDWR;
5111 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5112 if (cap) {
5113 cap->set_wanted(0);
5114
5115 // put locks in excl mode
5116 newi->filelock.set_state(LOCK_EXCL);
5117 newi->authlock.set_state(LOCK_EXCL);
5118 newi->xattrlock.set_state(LOCK_EXCL);
5119 }
5120 }
5121
5122 assert(dn->first == follows + 1);
5123 newi->first = dn->first;
5124
5125 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5126
5127 // prepare finisher
5128 mdr->ls = mdlog->get_current_segment();
5129 EUpdate *le = new EUpdate(mdlog, "mknod");
5130 mdlog->start_entry(le);
5131 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5132 journal_allocated_inos(mdr, &le->metablob);
5133
5134 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5135 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5136 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5137
5138 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5139 }
5140
5141
5142
5143 // MKDIR
5144 /* This function takes responsibility for the passed mdr*/
5145 void Server::handle_client_mkdir(MDRequestRef& mdr)
5146 {
5147 MClientRequest *req = mdr->client_request;
5148 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5149 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5150 if (!dn) return;
5151 if (mdr->snapid != CEPH_NOSNAP) {
5152 respond_to_request(mdr, -EROFS);
5153 return;
5154 }
5155 CDir *dir = dn->get_dir();
5156 CInode *diri = dir->get_inode();
5157 rdlocks.insert(&diri->authlock);
5158 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5159 return;
5160
5161 // mkdir check access
5162 if (!check_access(mdr, diri, MAY_WRITE))
5163 return;
5164
5165 if (!check_fragment_space(mdr, dir))
5166 return;
5167
5168 // new inode
5169 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5170 snapid_t follows = realm->get_newest_seq();
5171
5172 unsigned mode = req->head.args.mkdir.mode;
5173 mode &= ~S_IFMT;
5174 mode |= S_IFDIR;
5175 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5176 assert(newi);
5177
5178 // it's a directory.
5179 dn->push_projected_linkage(newi);
5180
5181 newi->inode.version = dn->pre_dirty();
5182 newi->inode.rstat.rsubdirs = 1;
5183 newi->inode.update_backtrace();
5184
5185 dout(12) << " follows " << follows << dendl;
5186 assert(dn->first == follows + 1);
5187 newi->first = dn->first;
5188
5189 // ...and that new dir is empty.
5190 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5191 newdir->state_set(CDir::STATE_CREATING);
5192 newdir->mark_complete();
5193 newdir->fnode.version = newdir->pre_dirty();
5194
5195 // prepare finisher
5196 mdr->ls = mdlog->get_current_segment();
5197 EUpdate *le = new EUpdate(mdlog, "mkdir");
5198 mdlog->start_entry(le);
5199 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5200 journal_allocated_inos(mdr, &le->metablob);
5201 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5202 le->metablob.add_primary_dentry(dn, newi, true, true);
5203 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5204
5205 // issue a cap on the directory
5206 int cmode = CEPH_FILE_MODE_RDWR;
5207 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5208 if (cap) {
5209 cap->set_wanted(0);
5210
5211 // put locks in excl mode
5212 newi->filelock.set_state(LOCK_EXCL);
5213 newi->authlock.set_state(LOCK_EXCL);
5214 newi->xattrlock.set_state(LOCK_EXCL);
5215 }
5216
5217 // make sure this inode gets into the journal
5218 le->metablob.add_opened_ino(newi->ino());
5219 LogSegment *ls = mds->mdlog->get_current_segment();
5220 ls->open_files.push_back(&newi->item_open_file);
5221
5222 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5223 }
5224
5225
5226 // SYMLINK
5227
5228 void Server::handle_client_symlink(MDRequestRef& mdr)
5229 {
5230 MClientRequest *req = mdr->client_request;
5231 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5232 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5233 if (!dn) return;
5234 if (mdr->snapid != CEPH_NOSNAP) {
5235 respond_to_request(mdr, -EROFS);
5236 return;
5237 }
5238 CDir *dir = dn->get_dir();
5239 CInode *diri = dir->get_inode();
5240 rdlocks.insert(&diri->authlock);
5241 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5242 return;
5243
5244 if (!check_access(mdr, diri, MAY_WRITE))
5245 return;
5246
5247 if (!check_fragment_space(mdr, dir))
5248 return;
5249
5250 unsigned mode = S_IFLNK | 0777;
5251 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5252 assert(newi);
5253
5254 // it's a symlink
5255 dn->push_projected_linkage(newi);
5256
5257 newi->symlink = mempool::mds_co::string(boost::string_view(req->get_path2()));
5258 newi->inode.size = newi->symlink.length();
5259 newi->inode.rstat.rbytes = newi->inode.size;
5260 newi->inode.rstat.rfiles = 1;
5261 newi->inode.version = dn->pre_dirty();
5262 newi->inode.update_backtrace();
5263
5264 newi->first = dn->first;
5265
5266 // prepare finisher
5267 mdr->ls = mdlog->get_current_segment();
5268 EUpdate *le = new EUpdate(mdlog, "symlink");
5269 mdlog->start_entry(le);
5270 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5271 journal_allocated_inos(mdr, &le->metablob);
5272 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5273 le->metablob.add_primary_dentry(dn, newi, true, true);
5274
5275 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5276 }
5277
5278
5279
5280
5281
5282 // LINK
5283
5284 void Server::handle_client_link(MDRequestRef& mdr)
5285 {
5286 MClientRequest *req = mdr->client_request;
5287
5288 dout(7) << "handle_client_link " << req->get_filepath()
5289 << " to " << req->get_filepath2()
5290 << dendl;
5291
5292 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5293
5294 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5295 if (!dn) return;
5296 CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
5297 if (!targeti) return;
5298 if (mdr->snapid != CEPH_NOSNAP) {
5299 respond_to_request(mdr, -EROFS);
5300 return;
5301 }
5302
5303 CDir *dir = dn->get_dir();
5304 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5305 dout(7) << "target is " << *targeti << dendl;
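// hard links to directories are not allowed; fail now if the target's
// dir linkage can be trusted locally (auth or readable dentry).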
5306 if (targeti->is_dir()) {
5307 // if srcdn is replica, need to make sure its linkage is correct
5308 vector<CDentry*>& trace = mdr->dn[1];
5309 if (trace.empty() ||
5310 trace.back()->is_auth() ||
5311 trace.back()->lock.can_read(mdr->get_client())) {
5312 dout(7) << "target is a dir, failing..." << dendl;
5313 respond_to_request(mdr, -EINVAL);
5314 return;
5315 }
5316 }
5317
5318 xlocks.insert(&targeti->linklock);
5319
5320 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5321 return;
5322
5323 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5324 if (!check_access(mdr, targeti, MAY_WRITE))
5325 return;
5326
5327 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5328 return;
5329
5330 if (!check_fragment_space(mdr, dir))
5331 return;
5332 }
5333
5334 // go!
5335 assert(g_conf->mds_kill_link_at != 1);
5336
5337 // local or remote?
5338 if (targeti->is_auth())
5339 _link_local(mdr, dn, targeti);
5340 else
5341 _link_remote(mdr, true, dn, targeti);
5342 }
5343
5344
5345 class C_MDS_link_local_finish : public ServerLogContext {
5346 CDentry *dn;
5347 CInode *targeti;
5348 version_t dnpv;
5349 version_t tipv;
5350 public:
5351 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5352 version_t dnpv_, version_t tipv_) :
5353 ServerLogContext(s, r), dn(d), targeti(ti),
5354 dnpv(dnpv_), tipv(tipv_) { }
5355 void finish(int r) override {
5356 assert(r == 0);
5357 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5358 }
5359 };
5360
5361
5362 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5363 {
5364 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
5365
5366 mdr->ls = mdlog->get_current_segment();
5367
5368 // predirty NEW dentry
5369 version_t dnpv = dn->pre_dirty();
5370 version_t tipv = targeti->pre_dirty();
5371
5372 // project inode update
5373 auto &pi = targeti->project_inode();
5374 pi.inode.nlink++;
5375 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
5376 pi.inode.change_attr++;
5377 pi.inode.version = tipv;
5378
5379 // log + wait
5380 EUpdate *le = new EUpdate(mdlog, "link_local");
5381 mdlog->start_entry(le);
5382 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5383 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
5384 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
5385 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5386 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
5387
5388 // do this after predirty_*, to avoid funky extra dnl arg
5389 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5390
5391 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
5392 }
5393
5394 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
5395 version_t dnpv, version_t tipv)
5396 {
5397 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
5398
5399 // link and unlock the NEW dentry
5400 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5401 if (!dnl->get_inode())
5402 dn->link_remote(dnl, targeti);
5403 dn->mark_dirty(dnpv, mdr->ls);
5404
5405 // target inode
5406 targeti->pop_and_dirty_projected_inode(mdr->ls);
5407
5408 mdr->apply();
5409
5410 MDRequestRef null_ref;
5411 mdcache->send_dentry_link(dn, null_ref);
5412
5413 // bump target popularity
5414 utime_t now = ceph_clock_now();
5415 mds->balancer->hit_inode(now, targeti, META_POP_IWR);
5416 mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
5417
5418 // reply
5419 respond_to_request(mdr, 0);
5420 }
5421
5422
5423 // link / unlink remote
5424
5425 class C_MDS_link_remote_finish : public ServerLogContext {
5426 bool inc;
5427 CDentry *dn;
5428 CInode *targeti;
5429 version_t dpv;
5430 public:
5431 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5432 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5433 dpv(d->get_projected_version()) {}
5434 void finish(int r) override {
5435 assert(r == 0);
5436 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5437 }
5438 };
5439
5440 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
5441 {
5442 dout(10) << "_link_remote "
5443 << (inc ? "link ":"unlink ")
5444 << *dn << " to " << *targeti << dendl;
5445
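// two-phase update: the target's auth MDS journals a slave prepare
// (nlink++/--) first; only after its LINKPREPACK do we journal our own
// EUpdate and reply. see handle_slave_link_prep() below.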
5446 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5447 mds_rank_t linkauth = targeti->authority().first;
5448 if (mdr->more()->witnessed.count(linkauth) == 0) {
5449 if (mds->is_cluster_degraded() &&
5450 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
5451 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
5452 if (mdr->more()->waiting_on_slave.empty())
5453 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
5454 return;
5455 }
5456
5457 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
5458 int op;
5459 if (inc)
5460 op = MMDSSlaveRequest::OP_LINKPREP;
5461 else
5462 op = MMDSSlaveRequest::OP_UNLINKPREP;
5463 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
5464 targeti->set_object_info(req->get_object_info());
5465 req->op_stamp = mdr->get_op_stamp();
5466 mds->send_message_mds(req, linkauth);
5467
5468 assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
5469 mdr->more()->waiting_on_slave.insert(linkauth);
5470 return;
5471 }
5472 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
5473
5474 assert(g_conf->mds_kill_link_at != 2);
5475
5476 mdr->set_mds_stamp(ceph_clock_now());
5477
5478 // add to event
5479 mdr->ls = mdlog->get_current_segment();
5480 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
5481 mdlog->start_entry(le);
5482 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5483 if (!mdr->more()->witnessed.empty()) {
5484 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5485 le->reqid = mdr->reqid;
5486 le->had_slaves = true;
5487 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5488 }
5489
5490 if (inc) {
5491 dn->pre_dirty();
5492 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
5493 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5494 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5495 } else {
5496 dn->pre_dirty();
5497 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
5498 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5499 le->metablob.add_null_dentry(dn, true);
5500 dn->push_projected_linkage();
5501 }
5502
5503 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
5504 }
5505
5506 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
5507 CDentry *dn, CInode *targeti,
5508 version_t dpv)
5509 {
5510 dout(10) << "_link_remote_finish "
5511 << (inc ? "link ":"unlink ")
5512 << *dn << " to " << *targeti << dendl;
5513
5514 assert(g_conf->mds_kill_link_at != 3);
5515
5516 if (!mdr->more()->witnessed.empty())
5517 mdcache->logged_master_update(mdr->reqid);
5518
5519 if (inc) {
5520 // link the new dentry
5521 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5522 if (!dnl->get_inode())
5523 dn->link_remote(dnl, targeti);
5524 dn->mark_dirty(dpv, mdr->ls);
5525 } else {
5526 // unlink main dentry
5527 dn->get_dir()->unlink_inode(dn);
5528 dn->pop_projected_linkage();
5529 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
5530 }
5531
5532 mdr->apply();
5533
5534 MDRequestRef null_ref;
5535 if (inc)
5536 mdcache->send_dentry_link(dn, null_ref);
5537 else
5538 mdcache->send_dentry_unlink(dn, NULL, null_ref);
5539
5540 // bump target popularity
5541 utime_t now = ceph_clock_now();
5542 mds->balancer->hit_inode(now, targeti, META_POP_IWR);
5543 mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
5544
5545 // reply
5546 respond_to_request(mdr, 0);
5547
5548 if (!inc)
5549 // removing a new dn?
5550 dn->get_dir()->try_remove_unlinked_dn(dn);
5551 }
5552
5553
5554 // remote linking/unlinking
5555
5556 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5557 CInode *targeti;
5558 public:
5559 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5560 ServerLogContext(s, r), targeti(t) { }
5561 void finish(int r) override {
5562 assert(r == 0);
5563 server->_logged_slave_link(mdr, targeti);
5564 }
5565 };
5566
5567 class C_MDS_SlaveLinkCommit : public ServerContext {
5568 MDRequestRef mdr;
5569 CInode *targeti;
5570 public:
5571 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5572 ServerContext(s), mdr(r), targeti(t) { }
5573 void finish(int r) override {
5574 server->_commit_slave_link(mdr, r, targeti);
5575 }
5576 };
5577
5578 /* This function DOES put the mdr->slave_request before returning*/
5579 void Server::handle_slave_link_prep(MDRequestRef& mdr)
5580 {
5581 dout(10) << "handle_slave_link_prep " << *mdr
5582 << " on " << mdr->slave_request->get_object_info()
5583 << dendl;
5584
5585 assert(g_conf->mds_kill_link_at != 4);
5586
5587 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
5588 assert(targeti);
5589 dout(10) << "targeti " << *targeti << dendl;
5590 CDentry *dn = targeti->get_parent_dn();
5591 CDentry::linkage_t *dnl = dn->get_linkage();
5592 assert(dnl->is_primary());
5593
5594 mdr->set_op_stamp(mdr->slave_request->op_stamp);
5595
5596 mdr->auth_pin(targeti);
5597
5598 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5599 assert(g_conf->mds_kill_link_at != 5);
5600
5601 // journal it
5602 mdr->ls = mdlog->get_current_segment();
5603 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
5604 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
5605 mdlog->start_entry(le);
5606
5607 auto &pi = dnl->get_inode()->project_inode();
5608
5609 // update journaled target inode
5610 bool inc;
5611 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
5612 inc = true;
5613 pi.inode.nlink++;
5614 } else {
5615 inc = false;
5616 pi.inode.nlink--;
5617 }
5618
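// stash enough old state to undo the nlink change if the master
// aborts; do_link_rollback() decodes and replays this blob.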
5619 link_rollback rollback;
5620 rollback.reqid = mdr->reqid;
5621 rollback.ino = targeti->ino();
5622 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
5623 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
5624 rollback.old_dir_mtime = pf->fragstat.mtime;
5625 rollback.old_dir_rctime = pf->rstat.rctime;
5626 rollback.was_inc = inc;
5627 ::encode(rollback, le->rollback);
5628 mdr->more()->rollback_bl = le->rollback;
5629
5630 pi.inode.ctime = mdr->get_op_stamp();
5631 pi.inode.version = targeti->pre_dirty();
5632
5633 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
5634
5635 // commit case
5636 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
5637 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
5638
5639 // set up commit waiter
5640 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
5641
5642 mdr->more()->slave_update_journaled = true;
5643 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
5644 mdr, __func__);
5645 mdlog->flush();
5646 }
5647
5648 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
5649 {
5650 dout(10) << "_logged_slave_link " << *mdr
5651 << " " << *targeti << dendl;
5652
5653 assert(g_conf->mds_kill_link_at != 6);
5654
5655 // update the target
5656 targeti->pop_and_dirty_projected_inode(mdr->ls);
5657 mdr->apply();
5658
5659 // hit pop
5660 utime_t now = ceph_clock_now();
5661 mds->balancer->hit_inode(now, targeti, META_POP_IWR);
5662
5663 // done.
5664 mdr->slave_request->put();
5665 mdr->slave_request = 0;
5666
5667 // ack
5668 if (!mdr->aborted) {
5669 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5670 MMDSSlaveRequest::OP_LINKPREPACK);
5671 mds->send_message_mds(reply, mdr->slave_to_mds);
5672 } else {
5673 dout(10) << " abort flag set, finishing" << dendl;
5674 mdcache->request_finish(mdr);
5675 }
5676 }
5677
5678
5679 struct C_MDS_CommittedSlave : public ServerLogContext {
5680 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
5681 void finish(int r) override {
5682 server->_committed_slave(mdr);
5683 }
5684 };
5685
5686 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
5687 {
5688 dout(10) << "_commit_slave_link " << *mdr
5689 << " r=" << r
5690 << " " << *targeti << dendl;
5691
5692 assert(g_conf->mds_kill_link_at != 7);
5693
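// r == 0 means the master committed; otherwise it aborted and we must
// undo the prepared nlink change via the rollback blob.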
5694 if (r == 0) {
5695 // drop our pins, etc.
5696 mdr->cleanup();
5697
5698 // write a commit to the journal
5699 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
5700 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
5701 mdlog->start_entry(le);
5702 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
5703 mdlog->flush();
5704 } else {
5705 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
5706 }
5707 }
5708
5709 void Server::_committed_slave(MDRequestRef& mdr)
5710 {
5711 dout(10) << "_committed_slave " << *mdr << dendl;
5712
5713 assert(g_conf->mds_kill_link_at != 8);
5714
5715 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5716 MMDSSlaveRequest::OP_COMMITTED);
5717 mds->send_message_mds(req, mdr->slave_to_mds);
5718 mdcache->request_finish(mdr);
5719 }
5720
5721 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5722 MutationRef mut;
5723 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5724 void finish(int r) override {
5725 server->_link_rollback_finish(mut, mdr);
5726 }
5727 };
5728
5729 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
5730 {
5731 link_rollback rollback;
5732 bufferlist::iterator p = rbl.begin();
5733 ::decode(rollback, p);
5734
5735 dout(10) << "do_link_rollback on " << rollback.reqid
5736 << (rollback.was_inc ? " inc":" dec")
5737 << " ino " << rollback.ino
5738 << dendl;
5739
5740 assert(g_conf->mds_kill_link_at != 9);
5741
5742 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
5743 assert(mdr || mds->is_resolve());
5744
5745 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
5746 mut->ls = mds->mdlog->get_current_segment();
5747
5748 CInode *in = mdcache->get_inode(rollback.ino);
5749 assert(in);
5750 dout(10) << " target is " << *in << dendl;
5751 assert(!in->is_projected()); // live slave requests hold the versionlock xlock; no concurrent projections.
5752
5753 auto &pi = in->project_inode();
5754 pi.inode.version = in->pre_dirty();
5755 mut->add_projected_inode(in);
5756
5757 // parent dir rctime
5758 CDir *parent = in->get_projected_parent_dn()->get_dir();
5759 fnode_t *pf = parent->project_fnode();
5760 mut->add_projected_fnode(parent);
5761 pf->version = parent->pre_dirty();
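// only restore the dir mtime/rctime if our link op was the last to
// touch them; a newer update must not be clobbered.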
5762 if (pf->fragstat.mtime == pi.inode.ctime) {
5763 pf->fragstat.mtime = rollback.old_dir_mtime;
5764 if (pf->rstat.rctime == pi.inode.ctime)
5765 pf->rstat.rctime = rollback.old_dir_rctime;
5766 mut->add_updated_lock(&parent->get_inode()->filelock);
5767 mut->add_updated_lock(&parent->get_inode()->nestlock);
5768 }
5769
5770 // inode
5771 pi.inode.ctime = pi.inode.rstat.rctime = rollback.old_ctime;
5772 if (rollback.was_inc)
5773 pi.inode.nlink--;
5774 else
5775 pi.inode.nlink++;
5776
5777 // journal it
5778 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
5779 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
5780 mdlog->start_entry(le);
5781 le->commit.add_dir_context(parent);
5782 le->commit.add_dir(parent, true);
5783 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
5784
5785 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
5786 mdr, __func__);
5787 mdlog->flush();
5788 }
5789
5790 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
5791 {
5792 dout(10) << "_link_rollback_finish" << dendl;
5793
5794 assert(g_conf->mds_kill_link_at != 10);
5795
5796 mut->apply();
5797 if (mdr)
5798 mdcache->request_finish(mdr);
5799
5800 mdcache->finish_rollback(mut->reqid);
5801
5802 mut->cleanup();
5803 }
5804
5805
5806 /* This function DOES NOT put the passed message before returning*/
5807 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
5808 {
5809 dout(10) << "handle_slave_link_prep_ack " << *mdr
5810 << " " << *m << dendl;
5811 mds_rank_t from = mds_rank_t(m->get_source().num());
5812
5813 assert(g_conf->mds_kill_link_at != 11);
5814
5815 // note slave
5816 mdr->more()->slaves.insert(from);
5817
5818 // witnessed!
5819 assert(mdr->more()->witnessed.count(from) == 0);
5820 mdr->more()->witnessed.insert(from);
5821 assert(!m->is_not_journaled());
5822 mdr->more()->has_journaled_slaves = true;
5823
5824 // remove from waiting list
5825 assert(mdr->more()->waiting_on_slave.count(from));
5826 mdr->more()->waiting_on_slave.erase(from);
5827
5828 assert(mdr->more()->waiting_on_slave.empty());
5829
5830 dispatch_client_request(mdr); // go again!
5831 }
5832
5833
5834
5835
5836
5837 // UNLINK
5838
5839 void Server::handle_client_unlink(MDRequestRef& mdr)
5840 {
5841 MClientRequest *req = mdr->client_request;
5842 client_t client = mdr->get_client();
5843
5844 // rmdir or unlink?
5845 bool rmdir = false;
5846 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
5847
5848 if (req->get_filepath().depth() == 0) {
5849 respond_to_request(mdr, -EINVAL);
5850 return;
5851 }
5852
5853 // traverse to path
5854 vector<CDentry*> trace;
5855 CInode *in;
5856 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
5857 if (r > 0) return;
5858 if (r < 0) {
5859 if (r == -ESTALE) {
5860 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
5861 mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
5862 return;
5863 }
5864 respond_to_request(mdr, r);
5865 return;
5866 }
5867 if (mdr->snapid != CEPH_NOSNAP) {
5868 respond_to_request(mdr, -EROFS);
5869 return;
5870 }
5871
5872 CDentry *dn = trace[trace.size()-1];
5873 assert(dn);
5874 if (!dn->is_auth()) {
5875 mdcache->request_forward(mdr, dn->authority().first);
5876 return;
5877 }
5878
5879 CInode *diri = dn->get_dir()->get_inode();
5880
5881 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
5882 assert(!dnl->is_null());
5883
5884 if (rmdir) {
5885 dout(7) << "handle_client_rmdir on " << *dn << dendl;
5886 } else {
5887 dout(7) << "handle_client_unlink on " << *dn << dendl;
5888 }
5889 dout(7) << "dn links to " << *in << dendl;
5890
5891 // rmdir vs is_dir
5892 if (in->is_dir()) {
5893 if (rmdir) {
5894 // do empty directory checks
5895 if (_dir_is_nonempty_unlocked(mdr, in)) {
5896 respond_to_request(mdr, -ENOTEMPTY);
5897 return;
5898 }
5899 } else {
5900 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
5901 respond_to_request(mdr, -EISDIR);
5902 return;
5903 }
5904 } else {
5905 if (rmdir) {
5906 // unlink
5907 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
5908 respond_to_request(mdr, -ENOTDIR);
5909 return;
5910 }
5911 }
5912
5913 // -- create stray dentry? --
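// if this is the primary link, the inode must be relinked under a
// stray directory so it survives until all remaining references
// (caps, remote links, snapshots) are gone, and can then be purged.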
5914 CDentry *straydn = NULL;
5915 if (dnl->is_primary()) {
5916 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
5917 if (!straydn)
5918 return;
5919 dout(10) << " straydn is " << *straydn << dendl;
5920 } else if (mdr->straydn) {
5921 mdr->unpin(mdr->straydn);
5922 mdr->straydn = NULL;
5923 }
5924
5925 // lock
5926 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5927
5928 for (int i=0; i<(int)trace.size()-1; i++)
5929 rdlocks.insert(&trace[i]->lock);
5930 xlocks.insert(&dn->lock);
5931 wrlocks.insert(&diri->filelock);
5932 wrlocks.insert(&diri->nestlock);
5933 xlocks.insert(&in->linklock);
5934 if (straydn) {
5935 wrlocks.insert(&straydn->get_dir()->inode->filelock);
5936 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
5937 xlocks.insert(&straydn->lock);
5938 }
5939 if (in->is_dir())
5940 rdlocks.insert(&in->filelock); // to verify it's empty
5941 mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
5942
5943 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5944 return;
5945
5946 if (in->is_dir() &&
5947 _dir_is_nonempty(mdr, in)) {
5948 respond_to_request(mdr, -ENOTEMPTY);
5949 return;
5950 }
5951
5952 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5953 if (!check_access(mdr, diri, MAY_WRITE))
5954 return;
5955 }
5956
5957 // yay!
5958 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
5959 // subtree root auths need to be witnesses
5960 set<mds_rank_t> witnesses;
5961 in->list_replicas(witnesses);
5962 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
5963
5964 for (set<mds_rank_t>::iterator p = witnesses.begin();
5965 p != witnesses.end();
5966 ++p) {
5967 if (mdr->more()->witnessed.count(*p)) {
5968 dout(10) << " already witnessed by mds." << *p << dendl;
5969 } else if (mdr->more()->waiting_on_slave.count(*p)) {
5970 dout(10) << " already waiting on witness mds." << *p << dendl;
5971 } else {
5972 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
5973 return;
5974 }
5975 }
5976 if (!mdr->more()->waiting_on_slave.empty())
5977 return; // we're waiting for a witness.
5978 }
5979
5980 // ok!
5981 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
5982 _link_remote(mdr, false, dn, dnl->get_inode());
5983 else
5984 _unlink_local(mdr, dn, straydn);
5985 }
5986
5987 class C_MDS_unlink_local_finish : public ServerLogContext {
5988 CDentry *dn;
5989 CDentry *straydn;
5990 version_t dnpv; // deleted dentry
5991 public:
5992 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
5993 ServerLogContext(s, r), dn(d), straydn(sd),
5994 dnpv(d->get_projected_version()) {}
5995 void finish(int r) override {
5996 assert(r == 0);
5997 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
5998 }
5999 };
6000
6001 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6002 {
6003 dout(10) << "_unlink_local " << *dn << dendl;
6004
6005 CDentry::linkage_t *dnl = dn->get_projected_linkage();
6006 CInode *in = dnl->get_inode();
6007
6008 SnapRealm *realm = in->find_snaprealm();
6009 snapid_t follows = realm->get_newest_seq();
6010
6011 // ok, let's do it.
6012 mdr->ls = mdlog->get_current_segment();
6013
6014 // prepare log entry
6015 EUpdate *le = new EUpdate(mdlog, "unlink_local");
6016 mdlog->start_entry(le);
6017 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6018 if (!mdr->more()->witnessed.empty()) {
6019 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6020 le->reqid = mdr->reqid;
6021 le->had_slaves = true;
6022 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6023 }
6024
6025 if (straydn) {
6026 assert(dnl->is_primary());
6027 straydn->push_projected_linkage(in);
6028 straydn->first = follows + 1;
6029 }
6030
6031 // the unlinked dentry
6032 dn->pre_dirty();
6033
6034 auto &pi = in->project_inode();
6035 {
6036 std::string t;
6037 dn->make_path_string(t, true);
6038 pi.inode.stray_prior_path = mempool::mds_co::string(boost::string_view(t));
6039 }
6040 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
6041 pi.inode.version = in->pre_dirty();
6042 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
6043 pi.inode.change_attr++;
6044 pi.inode.nlink--;
6045 if (pi.inode.nlink == 0)
6046 in->state_set(CInode::STATE_ORPHAN);
6047
6048 if (dnl->is_primary()) {
6049 // primary link. add stray dentry.
6050 assert(straydn);
6051 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
6052 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6053
6054 // project snaprealm, too
6055 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
6056 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
6057
6058 pi.inode.update_backtrace();
6059 le->metablob.add_primary_dentry(straydn, in, true, true);
6060 } else {
6061 // remote link. update remote inode.
6062 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
6063 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6064 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
6065 }
6066
6067 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6068 le->metablob.add_null_dentry(dn, true);
6069
6070 if (in->is_dir()) {
6071 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6072 le->metablob.renamed_dirino = in->ino();
6073 }
6074
6075 dn->push_projected_linkage();
6076
6077 if (in->is_dir()) {
6078 assert(straydn);
6079 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6080
6081 in->maybe_export_pin(true);
6082 }
6083
6084 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6085 }
6086
6087 void Server::_unlink_local_finish(MDRequestRef& mdr,
6088 CDentry *dn, CDentry *straydn,
6089 version_t dnpv)
6090 {
6091 dout(10) << "_unlink_local_finish " << *dn << dendl;
6092
6093 if (!mdr->more()->witnessed.empty())
6094 mdcache->logged_master_update(mdr->reqid);
6095
6096 // unlink main dentry
6097 dn->get_dir()->unlink_inode(dn);
6098 dn->pop_projected_linkage();
6099
6100 // relink as stray? (i.e. was primary link?)
6101 CInode *strayin = NULL;
6102 bool snap_is_new = false;
6103 if (straydn) {
6104 dout(20) << " straydn is " << *straydn << dendl;
6105 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
6106 strayin = straydnl->get_inode();
6107
6108 snap_is_new = strayin->snaprealm ? true : false;
6109 mdcache->touch_dentry_bottom(straydn);
6110 }
6111
6112 dn->mark_dirty(dnpv, mdr->ls);
6113 mdr->apply();
6114
6115 if (snap_is_new) //only new if strayin exists
6116 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
6117
6118 mdcache->send_dentry_unlink(dn, straydn, mdr);
6119
6120 // update subtree map?
6121 if (straydn && strayin->is_dir())
6122 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6123
6124 // bump pop
6125 utime_t now = ceph_clock_now();
6126 mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
6127
6128 // reply
6129 respond_to_request(mdr, 0);
6130
6131 // removing a new dn?
6132 dn->get_dir()->try_remove_unlinked_dn(dn);
6133
6134 // clean up ?
6135 // respond_to_request() drops locks. So stray reintegration can race with us.
6136 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6137 // Tip off the MDCache that this dentry is a stray that
6138 // might be eligible for purge.
6139 mdcache->notify_stray(straydn);
6140 }
6141 }
6142
6143 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6144 {
6145 if (mds->is_cluster_degraded() &&
6146 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6147 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6148 if (mdr->more()->waiting_on_slave.empty())
6149 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6150 return false;
6151 }
6152
6153 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6154 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6155 MMDSSlaveRequest::OP_RMDIRPREP);
6156 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6157 for (auto dn : trace)
6158 req->srcdnpath.push_dentry(dn->get_name());
6159 mdcache->replicate_stray(straydn, who, req->stray);
6160
6161 req->op_stamp = mdr->get_op_stamp();
6162 mds->send_message_mds(req, who);
6163
6164 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6165 mdr->more()->waiting_on_slave.insert(who);
6166 return true;
6167 }
6168
6169 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6170 CDentry *dn, *straydn;
6171 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6172 : ServerLogContext(s, r), dn(d), straydn(st) {}
6173 void finish(int r) override {
6174 server->_logged_slave_rmdir(mdr, dn, straydn);
6175 }
6176 };
6177
6178 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6179 MDRequestRef mdr;
6180 CDentry *straydn;
6181 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6182 : ServerContext(s), mdr(r), straydn(sd) { }
6183 void finish(int r) override {
6184 server->_commit_slave_rmdir(mdr, r, straydn);
6185 }
6186 };
6187
6188 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6189 {
6190 dout(10) << "handle_slave_rmdir_prep " << *mdr
6191 << " " << mdr->slave_request->srcdnpath
6192 << " to " << mdr->slave_request->destdnpath
6193 << dendl;
6194
6195 vector<CDentry*> trace;
6196 filepath srcpath(mdr->slave_request->srcdnpath);
6197 dout(10) << " src " << srcpath << dendl;
6198 CInode *in;
6199 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6200 if (r > 0) return;
6201 if (r == -ESTALE) {
6202 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6203 mdr->slave_to_mds);
6204 return;
6205 }
6206 assert(r == 0);
6207 CDentry *dn = trace[trace.size()-1];
6208 dout(10) << " dn " << *dn << dendl;
6209 mdr->pin(dn);
6210
6211 assert(mdr->straydn);
6212 CDentry *straydn = mdr->straydn;
6213 dout(10) << " straydn " << *straydn << dendl;
6214
6215 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6216
6217 rmdir_rollback rollback;
6218 rollback.reqid = mdr->reqid;
6219 rollback.src_dir = dn->get_dir()->dirfrag();
6220 rollback.src_dname = std::string(dn->get_name());
6221 rollback.dest_dir = straydn->get_dir()->dirfrag();
6222 rollback.dest_dname = std::string(straydn->get_name());
6223 ::encode(rollback, mdr->more()->rollback_bl);
6224 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6225
6226 // set up commit waiter
6227 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
6228
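// if we are not auth for any subtree under the target dir, there is
// nothing of ours to journal; apply the rename in cache and ack with
// mark_not_journaled() so the master knows we have no journal entry
// to commit.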
6229 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6230 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6231 dn->get_dir()->unlink_inode(dn);
6232 straydn->get_dir()->link_primary_inode(straydn, in);
6233
6234 assert(straydn->first >= in->first);
6235 in->first = straydn->first;
6236
6237 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
6238
6239 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6240 MMDSSlaveRequest::OP_RMDIRPREPACK);
6241 reply->mark_not_journaled();
6242 mds->send_message_mds(reply, mdr->slave_to_mds);
6243
6244 // send caps to auth (if we're not already)
6245 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
6246 mdcache->migrator->export_caps(in);
6247
6248 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
6249
6250 mdr->slave_request->put();
6251 mdr->slave_request = 0;
6252 mdr->straydn = 0;
6253 return;
6254 }
6255
6256 straydn->push_projected_linkage(in);
6257 dn->push_projected_linkage();
6258
6259 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6260 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
6261 mdlog->start_entry(le);
6262 le->rollback = mdr->more()->rollback_bl;
6263
6264 le->commit.add_dir_context(straydn->get_dir());
6265 le->commit.add_primary_dentry(straydn, in, true);
6266 // slave: no need to journal original dentry
6267
6268 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6269 le->commit.renamed_dirino = in->ino();
6270
6271 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6272
6273 mdr->more()->slave_update_journaled = true;
6274 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
6275 mdr, __func__);
6276 mdlog->flush();
6277 }
6278
6279 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6280 {
6281 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
6282
6283 // update our cache now, so we are consistent with what is in the journal
6284 // when we journal a subtree map
6285 CInode *in = dn->get_linkage()->get_inode();
6286 dn->get_dir()->unlink_inode(dn);
6287 straydn->pop_projected_linkage();
6288 dn->pop_projected_linkage();
6289 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
6290
6291 // done.
6292 mdr->slave_request->put();
6293 mdr->slave_request = 0;
6294 mdr->straydn = 0;
6295
6296 if (!mdr->aborted) {
6297 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6298 MMDSSlaveRequest::OP_RMDIRPREPACK);
6299 mds->send_message_mds(reply, mdr->slave_to_mds);
6300 } else {
6301 dout(10) << " abort flag set, finishing" << dendl;
6302 mdcache->request_finish(mdr);
6303 }
6304 }
6305
6306 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
6307 {
6308 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6309 << " " << *ack << dendl;
6310
6311 mds_rank_t from = mds_rank_t(ack->get_source().num());
6312
6313 mdr->more()->slaves.insert(from);
6314 mdr->more()->witnessed.insert(from);
6315 if (!ack->is_not_journaled())
6316 mdr->more()->has_journaled_slaves = true;
6317
6318 // remove from waiting list
6319 assert(mdr->more()->waiting_on_slave.count(from));
6320 mdr->more()->waiting_on_slave.erase(from);
6321
6322 if (mdr->more()->waiting_on_slave.empty())
6323 dispatch_client_request(mdr); // go again!
6324 else
6325 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
6326 }
6327
6328 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
6329 {
6330 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
6331
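// as in _commit_slave_link(): r == 0 means the master committed;
// otherwise replay the rollback blob from the prepare phase.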
6332 if (r == 0) {
6333 if (mdr->more()->slave_update_journaled) {
6334 CInode *strayin = straydn->get_projected_linkage()->get_inode();
6335 if (strayin && !strayin->snaprealm)
6336 mdcache->clear_dirty_bits_for_stray(strayin);
6337 }
6338
6339 mdr->cleanup();
6340
6341 if (mdr->more()->slave_update_journaled) {
6342 // write a commit to the journal
6343 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
6344 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
6345 ESlaveUpdate::RMDIR);
6346 mdlog->start_entry(le);
6347 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6348 mdlog->flush();
6349 } else {
6350 _committed_slave(mdr);
6351 }
6352 } else {
6353 // abort
6354 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6355 }
6356 }
6357
6358 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6359 metareqid_t reqid;
6360 CDentry *dn;
6361 CDentry *straydn;
6362 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6363 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6364 void finish(int r) override {
6365 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6366 }
6367 };
6368
6369 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6370 {
6371 // unlike the other rollback methods, the rmdir rollback is only
6372 // needed to record the subtree changes in the journal for inode
6373 // replicas who are auth for empty dirfrags. no actual changes to
6374 // the file system are taking place here, so there is no Mutation.
6375
6376 rmdir_rollback rollback;
6377 bufferlist::iterator p = rbl.begin();
6378 ::decode(rollback, p);
6379
6380 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6381 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6382 assert(mdr || mds->is_resolve());
6383
6384 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6385 if (!dir)
6386 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6387 assert(dir);
6388 CDentry *dn = dir->lookup(rollback.src_dname);
6389 assert(dn);
6390 dout(10) << " dn " << *dn << dendl;
6391 dir = mdcache->get_dirfrag(rollback.dest_dir);
6392 assert(dir);
6393 CDentry *straydn = dir->lookup(rollback.dest_dname);
6394 assert(straydn);
6395 dout(10) << " straydn " << *straydn << dendl;
6396 CInode *in = straydn->get_linkage()->get_inode();
6397
6398 if (mdr && !mdr->more()->slave_update_journaled) {
6399 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6400
6401 straydn->get_dir()->unlink_inode(straydn);
6402 dn->get_dir()->link_primary_inode(dn, in);
6403
6404 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6405
6406 mdcache->request_finish(mdr);
6407 mdcache->finish_rollback(rollback.reqid);
6408 return;
6409 }
6410
6411 dn->push_projected_linkage(in);
6412 straydn->push_projected_linkage();
6413
6414 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6415 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6416 mdlog->start_entry(le);
6417
6418 le->commit.add_dir_context(dn->get_dir());
6419 le->commit.add_primary_dentry(dn, in, true);
6420 // slave: no need to journal straydn
6421
6422 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6423 le->commit.renamed_dirino = in->ino();
6424
6425 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6426
6427 submit_mdlog_entry(le,
6428 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6429 dn, straydn),
6430 mdr, __func__);
6431 mdlog->flush();
6432 }
6433
6434 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
6435 {
6436 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
6437
6438 straydn->get_dir()->unlink_inode(straydn);
6439 dn->pop_projected_linkage();
6440 straydn->pop_projected_linkage();
6441
6442 CInode *in = dn->get_linkage()->get_inode();
6443 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
6444 if (mds->is_resolve()) {
6445 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
6446 mdcache->try_trim_non_auth_subtree(root);
6447 }
6448
6449 if (mdr)
6450 mdcache->request_finish(mdr);
6451
6452 mdcache->finish_rollback(reqid);
6453 }
6454
6455
6456 /** _dir_is_nonempty[_unlocked]
6457 *
6458 * check whether a directory is non-empty (and hence cannot be rmdir'd).
6459 *
6460 * the unlocked variant is a fastpath check; we can't really be
6461 * sure until we rdlock the filelock.
6462 */
6463 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6464 {
6465 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6466 assert(in->is_auth());
6467
6468 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6469 return true; // in a snapshot!
6470
6471 list<CDir*> ls;
6472 in->get_dirfrags(ls);
6473 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6474 CDir *dir = *p;
6475 // is the frag obviously non-empty?
6476 if (dir->is_auth()) {
6477 if (dir->get_projected_fnode()->fragstat.size()) {
6478 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6479 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6480 return true;
6481 }
6482 }
6483 }
6484
6485 return false;
6486 }
6487
6488 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6489 {
6490 dout(10) << "dir_is_nonempty " << *in << dendl;
6491 assert(in->is_auth());
6492 assert(in->filelock.can_read(mdr->get_client()));
6493
6494 frag_info_t dirstat;
6495 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6496
6497 list<CDir*> ls;
6498 in->get_dirfrags(ls);
6499 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6500 CDir *dir = *p;
6501 const fnode_t *pf = dir->get_projected_fnode();
6502 if (pf->fragstat.size()) {
6503 dout(10) << "dir_is_nonempty dirstat has "
6504 << pf->fragstat.size() << " items " << *dir << dendl;
6505 return true;
6506 }
6507
6508 if (pf->accounted_fragstat.version == dirstat_version)
6509 dirstat.add(pf->accounted_fragstat);
6510 else
6511 dirstat.add(pf->fragstat);
6512 }
6513
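// if the per-frag stats don't sum up to the inode's dirstat, an
// update is still in flight somewhere; treat the dir as non-empty.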
6514 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6515 }
6516
6517
6518 // ======================================================
6519
6520
6521 class C_MDS_rename_finish : public ServerLogContext {
6522 CDentry *srcdn;
6523 CDentry *destdn;
6524 CDentry *straydn;
6525 public:
6526 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6527 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6528 ServerLogContext(s, r),
6529 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6530 void finish(int r) override {
6531 assert(r == 0);
6532 server->_rename_finish(mdr, srcdn, destdn, straydn);
6533 }
6534 };
6535
6536
6537 /** handle_client_rename
6538 *
6539 * rename master is the destdn auth. this is because cached inodes
6540 * must remain connected. thus, any replica of srci must also
6541 * replicate destdn, and possibly straydn, so that srci (and
6542 * destdn->inode) remain connected during the rename.
6543 *
6544 * to do this, we freeze srci, then the master (destdn auth) verifies that
6545 * all other nodes have also replicated destdn and straydn. note that
6546 * destdn replicas need not also replicate srci. this only works when
6547 * destdn is master.
6548 *
6549 * This function takes responsibility for the passed mdr.
6550 */
6551 void Server::handle_client_rename(MDRequestRef& mdr)
6552 {
6553 MClientRequest *req = mdr->client_request;
6554 dout(7) << "handle_client_rename " << *req << dendl;
6555
6556 filepath destpath = req->get_filepath();
6557 filepath srcpath = req->get_filepath2();
6558 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6559 respond_to_request(mdr, -EINVAL);
6560 return;
6561 }
6562 boost::string_view destname = destpath.last_dentry();
6563
6564 vector<CDentry*>& srctrace = mdr->dn[1];
6565 vector<CDentry*>& desttrace = mdr->dn[0];
6566
6567 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6568
6569 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6570 if (!destdn) return;
6571 dout(10) << " destdn " << *destdn << dendl;
6572 if (mdr->snapid != CEPH_NOSNAP) {
6573 respond_to_request(mdr, -EROFS);
6574 return;
6575 }
6576 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6577 CDir *destdir = destdn->get_dir();
6578 assert(destdir->is_auth());
6579
6580 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6581 if (r > 0)
6582 return; // delayed
6583 if (r < 0) {
6584 if (r == -ESTALE) {
6585 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6586 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6587 } else {
6588 dout(10) << "FAIL on error " << r << dendl;
6589 respond_to_request(mdr, r);
6590 }
6591 return;
6592
6593 }
6594 assert(!srctrace.empty());
6595 CDentry *srcdn = srctrace[srctrace.size()-1];
6596 dout(10) << " srcdn " << *srcdn << dendl;
6597 if (srcdn->last != CEPH_NOSNAP) {
6598 respond_to_request(mdr, -EROFS);
6599 return;
6600 }
6601 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6602 CInode *srci = srcdnl->get_inode();
6603 dout(10) << " srci " << *srci << dendl;
6604
6605 CInode *oldin = 0;
6606 if (!destdnl->is_null()) {
6607 //dout(10) << "dest dn exists " << *destdn << dendl;
6608 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6609 if (!oldin) return;
6610 dout(10) << " oldin " << *oldin << dendl;
6611
6612 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6613 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6614 respond_to_request(mdr, -ENOTEMPTY);
6615 return;
6616 }
6617
6618 // if srcdn is replica, need to make sure its linkage is correct
6619 if (srcdn->is_auth() ||
6620 srcdn->lock.can_read(mdr->get_client()) ||
6621 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
6622 // mv /some/thing /to/some/existing_other_thing
6623 if (oldin->is_dir() && !srci->is_dir()) {
6624 respond_to_request(mdr, -EISDIR);
6625 return;
6626 }
6627 if (!oldin->is_dir() && srci->is_dir()) {
6628 respond_to_request(mdr, -ENOTDIR);
6629 return;
6630 }
6631 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6632 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6633 return;
6634 }
6635 }
6636 }
6637
6638 // -- some sanity checks --
6639
6640 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6641 if (destpath.get_ino() != srcpath.get_ino() &&
6642 !(req->get_source().is_mds() &&
6643 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6644 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6645 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6646 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6647 while (srcbase != destbase &&
6648 !srcbase->is_projected_ancestor_of(destbase)) {
6649 CDentry *pdn = srcbase->get_projected_parent_dn();
6650 srctrace.insert(srctrace.begin(), pdn);
6651 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6652 srcbase = pdn->get_dir()->get_inode();
6653 }
6654
6655 // then, extend destpath until it shares the same parent inode as srcpath.
6656 while (destbase != srcbase) {
6657 CDentry *pdn = destbase->get_projected_parent_dn();
6658 desttrace.insert(desttrace.begin(), pdn);
6659 rdlocks.insert(&pdn->lock);
6660 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6661 destbase = pdn->get_dir()->get_inode();
6662 }
6663 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6664 }
6665
6666 // src == dest?
6667 if (srcdn->get_dir() == destdir && srcdn->get_name() == destname) {
6668 dout(7) << "rename src=dest, noop" << dendl;
6669 respond_to_request(mdr, 0);
6670 return;
6671 }
6672
6673 // dest a child of src?
6674 // e.g. mv /usr /usr/foo
6675 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6676 while (pdn) {
6677 if (pdn == srcdn) {
6678 dout(7) << "cannot rename item to be a child of itself" << dendl;
6679 respond_to_request(mdr, -EINVAL);
6680 return;
6681 }
6682 pdn = pdn->get_dir()->inode->parent;
6683 }
6684
6685 // is this a stray migration, reintegration or merge? (sanity checks!)
6686 if (mdr->reqid.name.is_mds() &&
6687 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6688 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6689 !(destdnl->is_remote() &&
6690 destdnl->get_remote_ino() == srci->ino())) {
6691 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
6692 return;
6693 }
6694
6695 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6696 (srcdnl->is_primary() || destdnl->is_primary()));
6697 if (linkmerge)
6698 dout(10) << " this is a link merge" << dendl;
6699
6700 // -- create stray dentry? --
6701 CDentry *straydn = NULL;
6702 if (destdnl->is_primary() && !linkmerge) {
6703 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6704 if (!straydn)
6705 return;
6706 dout(10) << " straydn is " << *straydn << dendl;
6707 } else if (mdr->straydn) {
6708 mdr->unpin(mdr->straydn);
6709 mdr->straydn = NULL;
6710 }
6711
6712 // -- prepare witness list --
6713 /*
6714 * NOTE: we use _all_ replicas as witnesses.
6715 * this probably isn't totally necessary (esp for file renames),
6716 * but if/when we change that, we have to make sure rejoin is
6717 * sufficiently robust to handle strong rejoins from survivors
6718 * with totally wrong dentry->inode linkage.
6719 * (currently, it can ignore rename effects, because the resolve
6720 * stage will sort them out.)
6721 */
6722 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6723 if (srcdn->is_auth())
6724 srcdn->list_replicas(witnesses);
6725 else
6726 witnesses.insert(srcdn->authority().first);
6727 if (srcdnl->is_remote() && !srci->is_auth())
6728 witnesses.insert(srci->authority().first);
6729 destdn->list_replicas(witnesses);
6730 if (destdnl->is_remote() && !oldin->is_auth())
6731 witnesses.insert(oldin->authority().first);
6732 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6733
6734
6735 // -- locks --
6736 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6737
6738 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6739 for (int i=0; i<(int)srctrace.size(); i++)
6740 rdlocks.insert(&srctrace[i]->lock);
6741 xlocks.insert(&srcdn->lock);
6742 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6743 if (srcdirauth != mds->get_nodeid()) {
6744 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6745 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6746 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6747 if (srci->is_dir())
6748 rdlocks.insert(&srci->dirfragtreelock);
6749 } else {
6750 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6751 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6752 }
6753 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6754
6755 // straydn?
6756 if (straydn) {
6757 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6758 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6759 xlocks.insert(&straydn->lock);
6760 }
6761
6762 // xlock versionlock on dentries if there are witnesses.
6763 // replicas can't see projected dentry linkages, and will get
6764 // confused if we try to pipeline things.
6765 if (!witnesses.empty()) {
6766 // take xlock on all projected ancestor dentries for srcdn and destdn.
6767 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6768 for (int i=0; i<(int)srctrace.size(); i++) {
6769 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6770 xlocks.insert(&srctrace[i]->versionlock);
6771 }
6772 for (int i=0; i<(int)desttrace.size(); i++) {
6773 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6774 xlocks.insert(&desttrace[i]->versionlock);
6775 }
6776 // xlock srci and oldin's primary dentries, so witnesses can call
6777 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6778 // is traversed.
6779 if (srcdnl->is_remote())
6780 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6781 if (destdnl->is_remote())
6782 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6783 }
6784
6785 // we need to update srci's ctime. xlock its least contended lock to do that...
6786 xlocks.insert(&srci->linklock);
6787
6788 // xlock oldin (for nlink--)
6789 if (oldin) {
6790 xlocks.insert(&oldin->linklock);
6791 if (oldin->is_dir())
6792 rdlocks.insert(&oldin->filelock);
6793 }
6794 if (srcdnl->is_primary() && srci->is_dir())
6795 // FIXME: this should happen whenever we are renaming between
6796 // realms, regardless of the file type
6797 // FIXME: If/when this changes, make sure to update the
6798 // "allowance" in handle_slave_rename_prep
6799 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6800 else
6801 rdlocks.insert(&srci->snaplock);
6802
6803 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6804 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6805 &remote_wrlocks, auth_pin_freeze))
6806 return;
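  // (a false return means acquire_locks() couldn't get everything at once;
  //  it has already queued the mdr for retry once the missing locks are
  //  obtained, so we simply unwind and re-enter later from the top.)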
6807
6808 if (!mdr->has_more() || mdr->more()->witnessed.empty()) {
6809 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6810 return;
6811
6812 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6813 return;
6814
6815 if (!check_fragment_space(mdr, destdn->get_dir()))
6816 return;
6817
6818 if (!check_access(mdr, srci, MAY_WRITE))
6819 return;
6820 }
6821
6822 // with read lock, really verify oldin is empty
6823 if (oldin &&
6824 oldin->is_dir() &&
6825 _dir_is_nonempty(mdr, oldin)) {
6826 respond_to_request(mdr, -ENOTEMPTY);
6827 return;
6828 }
6829
6830 /* project_past_snaprealm_parent() will do this job
6831 *
6832 // moving between snaprealms?
6833 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6834 SnapRealm *srcrealm = srci->find_snaprealm();
6835 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6836 if (srcrealm != destrealm &&
6837 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6838 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6839 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6840 mdcache->snaprealm_create(mdr, srci);
6841 return;
6842 }
6843 }
6844 */
6845
6846 assert(g_conf->mds_kill_rename_at != 1);
6847
6848 // -- open all srcdn inode frags, if any --
6849 // we need these open so that auth can properly delegate from inode to dirfrags
6850 // after the inode is _ours_.
6851 if (srcdnl->is_primary() &&
6852 !srcdn->is_auth() &&
6853 srci->is_dir()) {
6854 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6855 mdr->set_stickydirs(srci);
6856
6857 list<frag_t> frags;
6858 srci->dirfragtree.get_leaves(frags);
6859 for (list<frag_t>::iterator p = frags.begin();
6860 p != frags.end();
6861 ++p) {
6862 CDir *dir = srci->get_dirfrag(*p);
6863 if (!dir) {
6864 dout(10) << " opening " << *p << " under " << *srci << dendl;
6865 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6866 return;
6867 }
6868 }
6869 }
6870
6871 // -- prepare witnesses --
6872
6873 // do srcdn auth last
6874 mds_rank_t last = MDS_RANK_NONE;
6875 if (!srcdn->is_auth()) {
6876 last = srcdn->authority().first;
6877 mdr->more()->srcdn_auth_mds = last;
6878 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6879 // are involved in the rename operation.
6880 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6881 dout(10) << " preparing ambiguous auth for srci" << dendl;
6882 assert(mdr->more()->is_remote_frozen_authpin);
6883 assert(mdr->more()->rename_inode == srci);
6884 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6885 return;
6886 }
6887 }
6888
6889 for (set<mds_rank_t>::iterator p = witnesses.begin();
6890 p != witnesses.end();
6891 ++p) {
6892 if (*p == last) continue; // do it last!
6893 if (mdr->more()->witnessed.count(*p)) {
6894 dout(10) << " already witnessed by mds." << *p << dendl;
6895 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6896 dout(10) << " already waiting on witness mds." << *p << dendl;
6897 } else {
6898 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6899 return;
6900 }
6901 }
6902 if (!mdr->more()->waiting_on_slave.empty())
6903 return; // we're waiting for a witness.
6904
6905 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6906 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6907 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6908 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6909 return;
6910 }
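  // (the srcdn auth is deliberately prepared last: its prepare verifies
  //  that the witness list covers all srcdn replicas and, for a primary
  //  dentry, performs the actual inode export -- so it should only run
  //  once every other witness has journaled its prepare.)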
6911
6912 // test hack: bail after the slave does prepare, so we can verify a _live_ rollback.
6913 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6914 assert(g_conf->mds_kill_rename_at != 3);
6915 if (!mdr->more()->slaves.empty() && srci->is_dir())
6916 assert(g_conf->mds_kill_rename_at != 4);
6917
6918 // -- declare now --
6919 mdr->set_mds_stamp(ceph_clock_now());
6920
6921 // -- prepare journal entry --
6922 mdr->ls = mdlog->get_current_segment();
6923 EUpdate *le = new EUpdate(mdlog, "rename");
6924 mdlog->start_entry(le);
6925 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6926 if (!mdr->more()->witnessed.empty()) {
6927 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6928
6929 le->reqid = mdr->reqid;
6930 le->had_slaves = true;
6931
6932 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6933 // no need to send frozen auth pin to recovering auth MDS of srci
6934 mdr->more()->is_remote_frozen_authpin = false;
6935 }
6936
6937 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6938 if (le->client_map.length())
6939 le->cmapv = mds->sessionmap.get_projected();
6940
6941 // -- commit locally --
6942 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6943
6944 journal_and_reply(mdr, srci, destdn, le, fin);
6945 }
6946
6947
6948 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
6949 {
6950 dout(10) << "_rename_finish " << *mdr << dendl;
6951
6952 if (!mdr->more()->witnessed.empty())
6953 mdcache->logged_master_update(mdr->reqid);
6954
6955 // apply
6956 _rename_apply(mdr, srcdn, destdn, straydn);
6957
6958 mdcache->send_dentry_link(destdn, mdr);
6959
6960 CDentry::linkage_t *destdnl = destdn->get_linkage();
6961 CInode *in = destdnl->get_inode();
6962 bool need_eval = mdr->more()->cap_imports.count(in);
6963
6964 // test hack: test slave commit
6965 if (!mdr->more()->slaves.empty() && !in->is_dir())
6966 assert(g_conf->mds_kill_rename_at != 5);
6967 if (!mdr->more()->slaves.empty() && in->is_dir())
6968 assert(g_conf->mds_kill_rename_at != 6);
6969
6970 // bump popularity
6971 utime_t now = ceph_clock_now();
6972 mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
6973 if (destdnl->is_remote() && in->is_auth())
6974 mds->balancer->hit_inode(now, in, META_POP_IWR);
6975
6976 // did we import srci? if so, explicitly ack that import, before we unlock and reply.
6977
6978 assert(g_conf->mds_kill_rename_at != 7);
6979
6980 // reply
6981 respond_to_request(mdr, 0);
6982
6983 if (need_eval)
6984 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
6985
6986 // clean up?
6987 // respond_to_request() drops locks. So stray reintegration can race with us.
6988 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6989 mdcache->notify_stray(straydn);
6990 }
6991 }
6992
6993
6994
6995 // helpers
6996
6997 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
6998 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
6999 {
7000 if (mds->is_cluster_degraded() &&
7001 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7002 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
7003 if (mdr->more()->waiting_on_slave.empty())
7004 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7005 return false;
7006 }
7007
7008 dout(10) << "_rename_prepare_witness mds." << who << dendl;
7009 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7010 MMDSSlaveRequest::OP_RENAMEPREP);
7011
7012 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
7013 for (auto dn : srctrace)
7014 req->srcdnpath.push_dentry(dn->get_name());
7015 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
7016 for (auto dn : dsttrace)
7017 req->destdnpath.push_dentry(dn->get_name());
7018 if (straydn)
7019 mdcache->replicate_stray(straydn, who, req->stray);
7020
7021 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7022
7023 // srcdn auth will verify our current witness list is sufficient
7024 req->witnesses = witnesses;
7025
7026 req->op_stamp = mdr->get_op_stamp();
7027 mds->send_message_mds(req, who);
7028
7029 assert(mdr->more()->waiting_on_slave.count(who) == 0);
7030 mdr->more()->waiting_on_slave.insert(who);
7031 return true;
7032 }
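// the witness handshake performed above, in broad strokes:
//   master --OP_RENAMEPREP--> witness   (src/dest paths, stray replica,
//                                        current witness set)
//   witness journals an ESlaveUpdate prepare (plus a rollback blob)
//   witness --OP_RENAMEPREPACK--> master
// the master tracks outstanding acks in more()->waiting_on_slave and only
// journals its own EUpdate once that set drains.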
7033
7034 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
7035 {
7036 version_t oldpv = mdr->more()->inode_import_v;
7037
7038 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7039
7040 /* import inode */
7041 bufferlist::iterator blp = mdr->more()->inode_import.begin();
7042
7043 // imported caps
7044 map<client_t,entity_inst_t> client_map;
7045 decode(client_map, blp);
7046 prepare_force_open_sessions(client_map, mdr->more()->imported_session_map);
7047 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
7048
7049 list<ScatterLock*> updated_scatterlocks;
7050 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
7051 mdr->more()->cap_imports, updated_scatterlocks);
7052
7053 // hack: force back to !auth and clean, temporarily
7054 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
7055 srcdnl->get_inode()->mark_clean();
7056
7057 return oldpv;
7058 }
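// (on the hack above: decode_import_inode() marks the inode auth and dirty
//  as it would for a regular subtree import, but this import only becomes
//  real when the rename commits, so we flip it back to !auth/clean here;
//  _rename_apply() sets STATE_AUTH again once the import is final.)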
7059
7060 bool Server::_need_force_journal(CInode *diri, bool empty)
7061 {
7062 list<CDir*> ls;
7063 diri->get_dirfrags(ls);
7064
7065 bool force_journal = false;
7066 if (empty) {
7067 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7068 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
7069 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7070 force_journal = true;
7071 break;
7072 } else
7073 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
7074 }
7075 } else {
7076 // see if any children of our frags are auth subtrees.
7077 list<CDir*> subtrees;
7078 mdcache->list_subtrees(subtrees);
7079 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
7080 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7081 CDir *dir = *p;
7082 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
7083 if (dir->contains(*q)) {
7084 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
7085 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
7086 << **q << dendl;
7087 force_journal = true;
7088 break;
7089 } else
7090 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
7091 } else
7092 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
7093 }
7094 if (force_journal)
7095 break;
7096 }
7097 }
7098 return force_journal;
7099 }
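// (why force journaling: if a dirfrag under the renamed directory is the
//  root of a subtree we are auth for, journal replay on this mds must be
//  able to re-anchor that subtree, which requires the rename to appear in
//  our journal even though we aren't auth for the dentries themselves.)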
7100
7101 void Server::_rename_prepare(MDRequestRef& mdr,
7102 EMetaBlob *metablob, bufferlist *client_map_bl,
7103 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7104 {
7105 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7106 if (straydn)
7107 dout(10) << " straydn " << *straydn << dendl;
7108
7109 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7110 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7111 CInode *srci = srcdnl->get_inode();
7112 CInode *oldin = destdnl->get_inode();
7113
7114 // primary+remote link merge?
7115 bool linkmerge = (srci == destdnl->get_inode() &&
7116 (srcdnl->is_primary() || destdnl->is_primary()));
7117 bool silent = srcdn->get_dir()->inode->is_stray();
7118
7119 bool force_journal_dest = false;
7120 if (srci->is_dir() && !destdn->is_auth()) {
7121 if (srci->is_auth()) {
7122 // if we are auth for srci and exporting it, force journal because journal replay needs
7123 // the source inode to create auth subtrees.
7124 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7125 force_journal_dest = true;
7126 } else
7127 force_journal_dest = _need_force_journal(srci, false);
7128 }
7129
7130 bool force_journal_stray = false;
7131 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7132 force_journal_stray = _need_force_journal(oldin, true);
7133
7134 if (linkmerge)
7135 dout(10) << " merging remote and primary links to the same inode" << dendl;
7136 if (silent)
7137 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7138 if (force_journal_dest)
7139 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7140 if (force_journal_stray)
7141 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7142
7143 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7144 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7145 metablob->renamed_dirino = srci->ino();
7146 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7147 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7148 metablob->renamed_dirino = oldin->ino();
7149 }
7150
7151 // prepare
7152 CInode::mempool_inode *spi = 0; // renamed inode
7153 CInode::mempool_inode *tpi = 0; // target/overwritten inode
7154
7155 // target inode
7156 if (!linkmerge) {
7157 if (destdnl->is_primary()) {
7158 assert(straydn); // moving to straydn.
7159 // link--, and move.
7160 if (destdn->is_auth()) {
7161 auto &pi = oldin->project_inode(); // project_snaprealm
7162 pi.inode.version = straydn->pre_dirty(pi.inode.version);
7163 pi.inode.update_backtrace();
7164 tpi = &pi.inode;
7165 }
7166 straydn->push_projected_linkage(oldin);
7167 } else if (destdnl->is_remote()) {
7168 // nlink-- targeti
7169 if (oldin->is_auth()) {
7170 auto &pi = oldin->project_inode();
7171 pi.inode.version = oldin->pre_dirty();
7172 tpi = &pi.inode;
7173 }
7174 }
7175 }
7176
7177 // dest
7178 if (srcdnl->is_remote()) {
7179 if (!linkmerge) {
7180 // destdn
7181 if (destdn->is_auth())
7182 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
7183 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7184 // srci
7185 if (srci->is_auth()) {
7186 auto &pi = srci->project_inode();
7187 pi.inode.version = srci->pre_dirty();
7188 spi = &pi.inode;
7189 }
7190 } else {
7191 dout(10) << " will merge remote onto primary link" << dendl;
7192 if (destdn->is_auth()) {
7193 auto &pi = oldin->project_inode();
7194 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
7195 spi = &pi.inode;
7196 }
7197 }
7198 } else { // primary
7199 if (destdn->is_auth()) {
7200 version_t oldpv;
7201 if (srcdn->is_auth())
7202 oldpv = srci->get_projected_version();
7203 else {
7204 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
7205
7206 // note which dirfrags have child subtrees in the journal
7207 // event, so that we can open those (as bounds) during replay.
7208 if (srci->is_dir()) {
7209 list<CDir*> ls;
7210 srci->get_dirfrags(ls);
7211 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7212 CDir *dir = *p;
7213 if (!dir->is_auth())
7214 metablob->renamed_dir_frags.push_back(dir->get_frag());
7215 }
7216 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
7217 }
7218 }
7219 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7220 // & srcdnl->snaprealm
7221 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
7222 pi.inode.update_backtrace();
7223 spi = &pi.inode;
7224 }
7225 destdn->push_projected_linkage(srci);
7226 }
7227
7228 // src
7229 if (srcdn->is_auth())
7230 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
7231 srcdn->push_projected_linkage(); // push null linkage
7232
7233 if (!silent) {
7234 if (spi) {
7235 spi->ctime = spi->rstat.rctime = mdr->get_op_stamp();
7236 spi->change_attr++;
7237 if (linkmerge)
7238 spi->nlink--;
7239 }
7240 if (tpi) {
7241 tpi->ctime = tpi->rstat.rctime = mdr->get_op_stamp();
7242 tpi->change_attr++;
7243 {
7244 std::string t;
7245 destdn->make_path_string(t, true);
7246 tpi->stray_prior_path = mempool::mds_co::string(boost::string_view(t));
7247 }
7248 tpi->nlink--;
7249 if (tpi->nlink == 0)
7250 oldin->state_set(CInode::STATE_ORPHAN);
7251 }
7252 }
7253
7254 // prepare nesting, mtime updates
7255 int predirty_dir = silent ? 0:PREDIRTY_DIR;
7256
7257 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7258 // then link the source inode to destdn
7259 if (destdnl->is_primary()) {
7260 assert(straydn);
7261 if (straydn->is_auth()) {
7262 metablob->add_dir_context(straydn->get_dir());
7263 metablob->add_dir(straydn->get_dir(), true);
7264 }
7265 }
7266
7267 // sub off target
7268 if (destdn->is_auth() && !destdnl->is_null()) {
7269 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
7270 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
7271 if (destdnl->is_primary()) {
7272 assert(straydn);
7273 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
7274 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7275 }
7276 }
7277
7278 // move srcdn
7279 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
7280 int flags = predirty_dir | predirty_primary;
7281 if (srcdn->is_auth())
7282 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
7283 if (destdn->is_auth())
7284 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
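  // (predirty_journal_parents() projects the fragstat/rstat deltas implied
  //  by the -1/+1 link counts above up the projected ancestry and records
  //  the touched parents in the metablob, so replay sees consistent
  //  recursive stats on both sides of the rename.)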
7285
7286 SnapRealm *src_realm = srci->find_snaprealm();
7287 SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
7288 snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
7289
7290 // add it all to the metablob
7291 // target inode
7292 if (!linkmerge) {
7293 if (destdnl->is_primary()) {
7294 assert(straydn);
7295 if (destdn->is_auth()) {
7296 // project snaprealm, too
7297 if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
7298 oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
7299 straydn->first = MAX(oldin->first, next_dest_snap);
7300 metablob->add_primary_dentry(straydn, oldin, true, true);
7301 } else if (force_journal_stray) {
7302 dout(10) << " forced journaling straydn " << *straydn << dendl;
7303 metablob->add_dir_context(straydn->get_dir());
7304 metablob->add_primary_dentry(straydn, oldin, true);
7305 }
7306 } else if (destdnl->is_remote()) {
7307 if (oldin->is_auth()) {
7308 // auth for targeti
7309 metablob->add_dir_context(oldin->get_projected_parent_dir());
7310 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
7311 CEPH_NOSNAP, 0, destdnl);
7312 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
7313 }
7314 }
7315 }
7316
7317 // dest
7318 if (srcdnl->is_remote()) {
7319 if (!linkmerge) {
7320 if (destdn->is_auth() && !destdnl->is_null())
7321 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7322 else
7323 destdn->first = MAX(destdn->first, next_dest_snap);
7324
7325 if (destdn->is_auth())
7326 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7327 if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
7328 metablob->add_dir_context(srci->get_projected_parent_dir());
7329 mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
7330 metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
7331 }
7332 } else {
7333 if (destdn->is_auth() && !destdnl->is_null())
7334 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7335 else
7336 destdn->first = MAX(destdn->first, next_dest_snap);
7337
7338 if (destdn->is_auth())
7339 metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
7340 }
7341 } else if (srcdnl->is_primary()) {
7342 // project snap parent update?
7343 if (destdn->is_auth() && src_realm != dest_realm &&
7344 (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
7345 srci->project_past_snaprealm_parent(dest_realm);
7346
7347 if (destdn->is_auth() && !destdnl->is_null())
7348 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7349 else
7350 destdn->first = MAX(destdn->first, next_dest_snap);
7351
7352 if (destdn->is_auth())
7353 metablob->add_primary_dentry(destdn, srci, true, true);
7354 else if (force_journal_dest) {
7355 dout(10) << " forced journaling destdn " << *destdn << dendl;
7356 metablob->add_dir_context(destdn->get_dir());
7357 metablob->add_primary_dentry(destdn, srci, true);
7358 if (srcdn->is_auth() && srci->is_dir()) {
7359 // journal the new subtree's root dirfrags
7360 list<CDir*> ls;
7361 srci->get_dirfrags(ls);
7362 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7363 CDir *dir = *p;
7364 if (dir->is_auth())
7365 metablob->add_dir(dir, true);
7366 }
7367 }
7368 }
7369 }
7370
7371 // src
7372 if (srcdn->is_auth()) {
7373 dout(10) << " journaling srcdn " << *srcdn << dendl;
7374 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
7375 // also journal the inode in case we need to do a slave rename rollback. It is OK to add
7376 // both primary and NULL dentries, because during journal replay the null dentry is
7377 // processed after the primary dentry.
7378 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
7379 metablob->add_primary_dentry(srcdn, srci, true);
7380 metablob->add_null_dentry(srcdn, true);
7381 } else
7382 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
7383
7384 // make renamed inode first track the dn
7385 if (srcdnl->is_primary() && destdn->is_auth())
7386 srci->first = destdn->first;
7387
7388 if (oldin && oldin->is_dir()) {
7389 assert(straydn);
7390 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
7391 }
7392 if (srci->is_dir())
7393 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
7394
7395 }
7396
7397
7398 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7399 {
7400 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7401 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
7402
7403 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7404 CDentry::linkage_t *destdnl = destdn->get_linkage();
7405
7406 CInode *oldin = destdnl->get_inode();
7407
7408 // primary+remote link merge?
7409 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7410 (srcdnl->is_primary() || destdnl->is_primary()));
7411
7412 // target inode
7413 if (!linkmerge) {
7414 if (destdnl->is_primary()) {
7415 assert(straydn);
7416 dout(10) << "straydn is " << *straydn << dendl;
7417 destdn->get_dir()->unlink_inode(destdn, false);
7418
7419 straydn->pop_projected_linkage();
7420 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7421 assert(!straydn->is_projected()); // no other projected
7422
7423 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7424
7425 // nlink-- targeti
7426 if (destdn->is_auth()) {
7427 bool hadrealm = (oldin->snaprealm ? true : false);
7428 oldin->pop_and_dirty_projected_inode(mdr->ls);
7429 if (oldin->snaprealm && !hadrealm)
7430 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
7431 } else {
7432 // FIXME this snaprealm is not filled out correctly
7433 //oldin->open_snaprealm(); might be sufficient..
7434 }
7435 } else if (destdnl->is_remote()) {
7436 destdn->get_dir()->unlink_inode(destdn, false);
7437 if (oldin->is_auth())
7438 oldin->pop_and_dirty_projected_inode(mdr->ls);
7439 }
7440 }
7441
7442 // unlink src before we relink it at dest
7443 CInode *in = srcdnl->get_inode();
7444 assert(in);
7445
7446 bool srcdn_was_remote = srcdnl->is_remote();
7447 srcdn->get_dir()->unlink_inode(srcdn);
7448
7449 // dest
7450 if (srcdn_was_remote) {
7451 if (!linkmerge) {
7452 // destdn
7453 destdnl = destdn->pop_projected_linkage();
7454 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7455 assert(!destdn->is_projected()); // no other projected
7456
7457 destdn->link_remote(destdnl, in);
7458 if (destdn->is_auth())
7459 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
7460 // in
7461 if (in->is_auth())
7462 in->pop_and_dirty_projected_inode(mdr->ls);
7463 } else {
7464 dout(10) << "merging remote onto primary link" << dendl;
7465 oldin->pop_and_dirty_projected_inode(mdr->ls);
7466 }
7467 } else { // primary
7468 if (linkmerge) {
7469 dout(10) << "merging primary onto remote link" << dendl;
7470 destdn->get_dir()->unlink_inode(destdn, false);
7471 }
7472 destdnl = destdn->pop_projected_linkage();
7473 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7474 assert(!destdn->is_projected()); // no other projected
7475
7476 // srcdn inode import?
7477 if (!srcdn->is_auth() && destdn->is_auth()) {
7478 assert(mdr->more()->inode_import.length() > 0);
7479
7480 map<client_t,Capability::Import> imported_caps;
7481
7482 // finish cap imports
7483 finish_force_open_sessions(mdr->more()->imported_session_map);
7484 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
7485 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
7486 mdr->more()->srcdn_auth_mds, true,
7487 mdr->more()->imported_session_map,
7488 mdr->more()->cap_imports[destdnl->get_inode()],
7489 imported_caps);
7490 }
7491
7492 mdr->more()->inode_import.clear();
7493 ::encode(imported_caps, mdr->more()->inode_import);
7494
7495 /* hack: add an auth pin for each xlock we hold. These were
7496 * remote xlocks previously but now they're local and
7497 * we're going to try and unpin when we xlock_finish. */
7498 for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
7499 i != mdr->xlocks.end();
7500 ++i)
7501 if ((*i)->get_parent() == destdnl->get_inode() &&
7502 !(*i)->is_locallock())
7503 mds->locker->xlock_import(*i);
7504
7505 // hack: fix auth bit
7506 in->state_set(CInode::STATE_AUTH);
7507
7508 mdr->clear_ambiguous_auth();
7509 }
7510
7511 if (destdn->is_auth()) {
7512 in->pop_and_dirty_projected_inode(mdr->ls);
7513
7514 } else {
7515 // FIXME: fix up snaprealm!
7516 }
7517 }
7518
7519 // src
7520 if (srcdn->is_auth())
7521 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
7522 srcdn->pop_projected_linkage();
7523 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7524 assert(!srcdn->is_projected()); // no other projected
7525
7526 // apply remaining projected inodes (nested)
7527 mdr->apply();
7528
7529 // update subtree map?
7530 if (destdnl->is_primary() && in->is_dir())
7531 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7532
7533 if (straydn && oldin->is_dir())
7534 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
7535
7536 // removing a new dn?
7537 if (srcdn->is_auth())
7538 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
7539 }
7540
7541
7542
7543 // ------------
7544 // SLAVE
7545
7546 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7547 CDentry *srcdn, *destdn, *straydn;
7548 public:
7549 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7550 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7551 void finish(int r) override {
7552 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7553 }
7554 };
7555
7556 class C_MDS_SlaveRenameCommit : public ServerContext {
7557 MDRequestRef mdr;
7558 CDentry *srcdn, *destdn, *straydn;
7559 public:
7560 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7561 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7562 void finish(int r) override {
7563 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7564 }
7565 };
7566
7567 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7568 MDRequestRef mdr;
7569 public:
7570 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7571 ServerContext(s), mdr(r) {}
7572 void finish(int r) override {
7573 server->_slave_rename_sessions_flushed(mdr);
7574 }
7575 };
7576
7577 /* This function DOES put the mdr->slave_request before returning */
7578 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7579 {
7580 dout(10) << "handle_slave_rename_prep " << *mdr
7581 << " " << mdr->slave_request->srcdnpath
7582 << " to " << mdr->slave_request->destdnpath
7583 << dendl;
7584
7585 if (mdr->slave_request->is_interrupted()) {
7586 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7587 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7588 reply->mark_interrupted();
7589 mds->send_message_mds(reply, mdr->slave_to_mds);
7590 mdr->slave_request->put();
7591 mdr->slave_request = 0;
7592 return;
7593 }
7594
7595 // discover destdn
7596 filepath destpath(mdr->slave_request->destdnpath);
7597 dout(10) << " dest " << destpath << dendl;
7598 vector<CDentry*> trace;
7599 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7600 if (r > 0) return;
7601 if (r == -ESTALE) {
7602 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7603 mdr->slave_to_mds);
7604 return;
7605 }
7606 assert(r == 0); // we shouldn't get an error here!
7607
7608 CDentry *destdn = trace[trace.size()-1];
7609 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7610 dout(10) << " destdn " << *destdn << dendl;
7611 mdr->pin(destdn);
7612
7613 // discover srcdn
7614 filepath srcpath(mdr->slave_request->srcdnpath);
7615 dout(10) << " src " << srcpath << dendl;
7616 CInode *srci = nullptr;
7617 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7618 if (r > 0) return;
7619 assert(r == 0);
7620
7621 // srcpath must not point to a null dentry
7622 assert(srci != nullptr);
7623
7624 CDentry *srcdn = trace[trace.size()-1];
7625 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7626 dout(10) << " srcdn " << *srcdn << dendl;
7627 mdr->pin(srcdn);
7628 mdr->pin(srci);
7629
7630 // stray?
7631 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7632 (srcdnl->is_primary() || destdnl->is_primary()));
7633 CDentry *straydn = mdr->straydn;
7634 if (destdnl->is_primary() && !linkmerge)
7635 assert(straydn);
7636
7637 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7638 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7639
7640 // set up commit waiter (early, to clean up any freezing etc we do)
7641 if (!mdr->more()->slave_commit)
7642 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7643
7644 // am i srcdn auth?
7645 if (srcdn->is_auth()) {
7646 set<mds_rank_t> srcdnrep;
7647 srcdn->list_replicas(srcdnrep);
7648
7649 bool reply_witness = false;
7650 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7651 // freeze?
7652 // we need this to
7653 // - avoid conflicting lock state changes
7654 // - avoid concurrent updates to the inode
7655 // (this could also be accomplished with the versionlock)
7656 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7657 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7658 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7659 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7660
7661 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7662 if (srcdnl->get_inode()->is_frozen_auth_pin())
7663 mdr->unfreeze_auth_pin();
7664
7665 if (!frozen_inode) {
7666 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7667 return;
7668 }
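      // (roughly: freeze_inode(allowance) only succeeds once no auth pins
      //  beyond the 'allowance' we account for above remain on the inode;
      //  until then it registers us as freezing, and the WAIT_FROZEN waiter
      //  re-dispatches this slave request from scratch.)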
7669
7670 /*
7671 * set ambiguous auth for srci
7672 * NOTE: we don't worry about ambiguous cache expire as we do
7673 * with subtree migrations because all slaves will pin
7674 * srcdn->get_inode() for duration of this rename.
7675 */
7676 mdr->set_ambiguous_auth(srcdnl->get_inode());
7677
7678 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7679 // the master will send another OP_RENAMEPREP slave request later.
7680 if (mdr->slave_request->witnesses.size() > 1) {
7681 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7682 reply_witness = true;
7683 }
7684
7685 // make sure bystanders have received all lock related messages
7686 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7687 if (*p == mdr->slave_to_mds ||
7688 (mds->is_cluster_degraded() &&
7689 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7690 continue;
7691 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7692 MMDSSlaveRequest::OP_RENAMENOTIFY);
7693 mds->send_message_mds(notify, *p);
7694 mdr->more()->waiting_on_slave.insert(*p);
7695 }
7696
7697 // make sure clients have received all cap related messages
7698 set<client_t> export_client_set;
7699 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7700
7701 MDSGatherBuilder gather(g_ceph_context);
7702 flush_client_sessions(export_client_set, gather);
7703 if (gather.has_subs()) {
7704 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7705 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7706 gather.activate();
7707 }
7708 }
7709
7710 // is witness list sufficient?
7711 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7712 if (*p == mdr->slave_to_mds ||
7713 mdr->slave_request->witnesses.count(*p)) continue;
7714 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7715 reply_witness = true;
7716 break;
7717 }
7718
7719 if (reply_witness) {
7720 assert(!srcdnrep.empty());
7721 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7722 MMDSSlaveRequest::OP_RENAMEPREPACK);
7723 reply->witnesses.swap(srcdnrep);
7724 mds->send_message_mds(reply, mdr->slave_to_mds);
7725 mdr->slave_request->put();
7726 mdr->slave_request = 0;
7727 return;
7728 }
7729 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7730 if (!mdr->more()->waiting_on_slave.empty()) {
7731 dout(10) << " still waiting for rename notify acks from "
7732 << mdr->more()->waiting_on_slave << dendl;
7733 return;
7734 }
7735 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7736 // set ambiguous auth for srci on witnesses
7737 mdr->set_ambiguous_auth(srcdnl->get_inode());
7738 }
7739
7740 // encode everything we'd need to roll this back... basically, just the original state.
7741 rename_rollback rollback;
7742
7743 rollback.reqid = mdr->reqid;
7744
7745 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7746 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7747 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7748 rollback.orig_src.dname = std::string(srcdn->get_name());
7749 if (srcdnl->is_primary())
7750 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7751 else {
7752 assert(srcdnl->is_remote());
7753 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7754 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7755 }
7756
7757 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7758 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7759 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7760 rollback.orig_dest.dname = std::string(destdn->get_name());
7761 if (destdnl->is_primary())
7762 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7763 else if (destdnl->is_remote()) {
7764 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7765 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7766 }
7767
7768 if (straydn) {
7769 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7770 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7771 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7772 rollback.stray.dname = std::string(straydn->get_name());
7773 }
7774 ::encode(rollback, mdr->more()->rollback_bl);
7775 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
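  // (the rollback blob captures just enough of the *original* state -- each
  //  dentry's old linkage plus the parent dirs' old mtime/rctime -- for
  //  do_rename_rollback() to reverse this prepare if the master dies before
  //  committing.)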
7776
7777 // journal.
7778 mdr->ls = mdlog->get_current_segment();
7779 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7780 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7781 mdlog->start_entry(le);
7782 le->rollback = mdr->more()->rollback_bl;
7783
7784 bufferlist blah; // inode import data... obviously not used if we're the slave
7785 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7786
7787 if (le->commit.empty()) {
7788 dout(10) << " empty metablob, skipping journal" << dendl;
7789 mdlog->cancel_entry(le);
7790 mdr->ls = NULL;
7791 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7792 } else {
7793 mdr->more()->slave_update_journaled = true;
7794 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7795 mdr, __func__);
7796 mdlog->flush();
7797 }
7798 }
7799
7800 void Server::_logged_slave_rename(MDRequestRef& mdr,
7801 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7802 {
7803 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7804
7805 // prepare ack
7806 MMDSSlaveRequest *reply = NULL;
7807 if (!mdr->aborted) {
7808 reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7809 if (!mdr->more()->slave_update_journaled)
7810 reply->mark_not_journaled();
7811 }
7812
7813 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7814 CDentry::linkage_t *destdnl = NULL;
7815 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7816
7817 // export srci?
7818 if (srcdn->is_auth() && srcdnl->is_primary()) {
7819 // set export bounds for CInode::encode_export()
7820 list<CDir*> bounds;
7821 if (srcdnl->get_inode()->is_dir()) {
7822 srcdnl->get_inode()->get_dirfrags(bounds);
7823 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7824 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7825 }
7826
7827 map<client_t,entity_inst_t> exported_client_map;
7828 bufferlist inodebl;
7829 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7830 exported_client_map);
7831
7832 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7833 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7834
7835 if (reply) {
7836 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7837 reply->inode_export.claim_append(inodebl);
7838 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7839 }
7840
7841 // remove mdr auth pin
7842 mdr->auth_unpin(srcdnl->get_inode());
7843 mdr->more()->is_inode_exporter = true;
7844
7845 if (srcdnl->get_inode()->is_dirty())
7846 srcdnl->get_inode()->mark_clean();
7847
7848 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
7849 }
7850
7851 // apply
7852 _rename_apply(mdr, srcdn, destdn, straydn);
7853
7854 destdnl = destdn->get_linkage();
7855
7856 // bump popularity
7857 utime_t now = ceph_clock_now();
7858 mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
7859 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7860 mds->balancer->hit_inode(now, destdnl->get_inode(), META_POP_IWR);
7861
7862 // done.
7863 mdr->slave_request->put();
7864 mdr->slave_request = 0;
7865 mdr->straydn = 0;
7866
7867 if (reply) {
7868 mds->send_message_mds(reply, mdr->slave_to_mds);
7869 } else {
7870 assert(mdr->aborted);
7871 dout(10) << " abort flag set, finishing" << dendl;
7872 mdcache->request_finish(mdr);
7873 }
7874 }
7875
7876 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7877 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7878 {
7879 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7880
7881 CDentry::linkage_t *destdnl = destdn->get_linkage();
7882
7883 list<MDSInternalContextBase*> finished;
7884 if (r == 0) {
7885 // unfreeze+singleauth inode
7886 // hmm, do i really need to delay this?
7887 if (mdr->more()->is_inode_exporter) {
7888
7889 CInode *in = destdnl->get_inode();
7890
7891 // drop our pins
7892 // we exported, clear out any xlocks that we moved to another MDS
7893 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7894 while (i != mdr->xlocks.end()) {
7895 SimpleLock *lock = *i++;
7896
7897 // we only care about xlocks on the exported inode
7898 if (lock->get_parent() == in &&
7899 !lock->is_locallock())
7900 mds->locker->xlock_export(lock, mdr.get());
7901 }
7902
7903 map<client_t,Capability::Import> peer_imported;
7904 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7905 ::decode(peer_imported, bp);
7906
7907 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7908 mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
7909 mdr->slave_to_mds, peer_imported, finished);
7910 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
7911
7912 // unfreeze
7913 assert(destdnl->get_inode()->is_frozen_inode());
7914 destdnl->get_inode()->unfreeze_inode(finished);
7915 }
7916
7917 // singleauth
7918 if (mdr->more()->is_ambiguous_auth) {
7919 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7920 mdr->more()->is_ambiguous_auth = false;
7921 }
7922
7923 if (straydn && mdr->more()->slave_update_journaled) {
7924 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7925 if (strayin && !strayin->snaprealm)
7926 mdcache->clear_dirty_bits_for_stray(strayin);
7927 }
7928
7929 mds->queue_waiters(finished);
7930 mdr->cleanup();
7931
7932 if (mdr->more()->slave_update_journaled) {
7933 // write a commit to the journal
7934 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
7935 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7936 ESlaveUpdate::RENAME);
7937 mdlog->start_entry(le);
7938 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7939 mdlog->flush();
7940 } else {
7941 _committed_slave(mdr);
7942 }
7943 } else {
7944
7945 // abort
7946 // rollback_bl may be empty if we froze the inode but had to provide an expanded
7947 // witness list to the master, and it failed before we tried prep again.
7948 if (mdr->more()->rollback_bl.length()) {
7949 if (mdr->more()->is_inode_exporter) {
7950 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
7951 destdnl->get_inode()->abort_export();
7952 }
7953 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
7954 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
7955 // rollback but preserve the slave request
7956 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
7957 mdr->more()->rollback_bl.clear();
7958 } else
7959 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
7960 } else {
7961 dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
7962 // singleauth
7963 if (mdr->more()->is_ambiguous_auth) {
7964 if (srcdn->is_auth())
7965 mdr->more()->rename_inode->unfreeze_inode(finished);
7966
7967 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
7968 mdr->more()->is_ambiguous_auth = false;
7969 }
7970 mds->queue_waiters(finished);
7971 mdcache->request_finish(mdr);
7972 }
7973 }
7974 }
7975
7976 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7977 bool isdir, int linkunlink, nest_info_t &rstat)
7978 {
7979 fnode_t *pf;
7980 pf = dir->project_fnode();
7981 mut->add_projected_fnode(dir);
7982 pf->version = dir->pre_dirty();
7983
7984 if (isdir) {
7985 pf->fragstat.nsubdirs += linkunlink;
7986 } else {
7987 pf->fragstat.nfiles += linkunlink;
7988 }
7989 if (r.ino) {
7990 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7991 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7992 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7993 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7994 }
7995 if (pf->fragstat.mtime == ctime) {
7996 pf->fragstat.mtime = r.dirfrag_old_mtime;
7997 if (pf->rstat.rctime == ctime)
7998 pf->rstat.rctime = r.dirfrag_old_rctime;
7999 }
8000 mut->add_updated_lock(&dir->get_inode()->filelock);
8001 mut->add_updated_lock(&dir->get_inode()->nestlock);
8002 }
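// (linkunlink is +1 when the rollback re-adds a link to this dir and -1
//  when it removes one; the fragstat/rstat deltas above are applied with
//  that sign.  the old mtime/rctime are restored only if they still match
//  the rename's ctime, i.e. nothing else has touched the dir since.)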
8003
8004 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
8005 MutationRef mut;
8006 CDentry *srcdn;
8007 version_t srcdnpv;
8008 CDentry *destdn;
8009 CDentry *straydn;
8010 bool finish_mdr;
8011 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
8012 CDentry *sd, version_t pv, CDentry *dd,
8013 CDentry *st, bool f) :
8014 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
8015 straydn(st), finish_mdr(f) {}
8016 void finish(int r) override {
8017 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
8018 destdn, straydn, finish_mdr);
8019 }
8020 };
8021
8022 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
8023 bool finish_mdr)
8024 {
8025 rename_rollback rollback;
8026 bufferlist::iterator p = rbl.begin();
8027 ::decode(rollback, p);
8028
8029 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
8030 // need to finish this update before sending resolve to claim the subtree
8031 mdcache->add_rollback(rollback.reqid, master);
8032
8033 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
8034 mut->ls = mds->mdlog->get_current_segment();
8035
8036 CDentry *srcdn = NULL;
8037 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
8038 if (!srcdir)
8039 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
8040 if (srcdir) {
8041 dout(10) << " srcdir " << *srcdir << dendl;
8042 srcdn = srcdir->lookup(rollback.orig_src.dname);
8043 if (srcdn) {
8044 dout(10) << " srcdn " << *srcdn << dendl;
8045 assert(srcdn->get_linkage()->is_null());
8046 } else
8047 dout(10) << " srcdn not found" << dendl;
8048 } else
8049 dout(10) << " srcdir not found" << dendl;
8050
8051 CDentry *destdn = NULL;
8052 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
8053 if (!destdir)
8054 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
8055 if (destdir) {
8056 dout(10) << " destdir " << *destdir << dendl;
8057 destdn = destdir->lookup(rollback.orig_dest.dname);
8058 if (destdn)
8059 dout(10) << " destdn " << *destdn << dendl;
8060 else
8061 dout(10) << " destdn not found" << dendl;
8062 } else
8063 dout(10) << " destdir not found" << dendl;
8064
8065 CInode *in = NULL;
8066 if (rollback.orig_src.ino) {
8067 in = mdcache->get_inode(rollback.orig_src.ino);
8068 if (in && in->is_dir())
8069 assert(srcdn && destdn);
8070 } else
8071 in = mdcache->get_inode(rollback.orig_src.remote_ino);
8072
8073 CDir *straydir = NULL;
8074 CDentry *straydn = NULL;
8075 if (rollback.stray.dirfrag.ino) {
8076 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
8077 if (straydir) {
8078 dout(10) << "straydir " << *straydir << dendl;
8079 straydn = straydir->lookup(rollback.stray.dname);
8080 if (straydn) {
8081 dout(10) << " straydn " << *straydn << dendl;
8082 assert(straydn->get_linkage()->is_primary());
8083 } else
8084 dout(10) << " straydn not found" << dendl;
8085 } else
8086 dout(10) << "straydir not found" << dendl;
8087 }
8088
8089 CInode *target = NULL;
8090 if (rollback.orig_dest.ino) {
8091 target = mdcache->get_inode(rollback.orig_dest.ino);
8092 if (target)
8093 assert(destdn && straydn);
8094 } else if (rollback.orig_dest.remote_ino)
8095 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
8096
8097 // can't use is_auth() in the resolve stage
8098 mds_rank_t whoami = mds->get_nodeid();
8099 // slave
8100 assert(!destdn || destdn->authority().first != whoami);
8101 assert(!straydn || straydn->authority().first != whoami);
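  // (during resolve, subtree authority can still be ambiguous, so is_auth()
  //  isn't reliable; comparing authority().first against our own rank is
  //  the conservative test.  rollback runs on slaves, and the master is the
  //  destdn auth, hence neither destdn nor straydn should be ours.)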
8102
8103 bool force_journal_src = false;
8104 bool force_journal_dest = false;
8105 if (in && in->is_dir() && srcdn->authority().first != whoami)
8106 force_journal_src = _need_force_journal(in, false);
8107 if (in && target && target->is_dir())
8108 force_journal_dest = _need_force_journal(in, true);
8109
8110 version_t srcdnpv = 0;
8111 // repair src
8112 if (srcdn) {
8113 if (srcdn->authority().first == whoami)
8114 srcdnpv = srcdn->pre_dirty();
8115 if (rollback.orig_src.ino) {
8116 assert(in);
8117 srcdn->push_projected_linkage(in);
8118 } else
8119 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
8120 rollback.orig_src.remote_d_type);
8121 }
8122
8123 CInode::mempool_inode *pip = 0;
8124 if (in) {
8125 if (in->authority().first == whoami) {
8126 auto &pi = in->project_inode();
8127 mut->add_projected_inode(in);
8128 pi.inode.version = in->pre_dirty();
8129 pip = &pi.inode;
8130 } else
8131 pip = in->get_projected_inode();
8132 if (pip->ctime == rollback.ctime)
8133 pip->ctime = pip->rstat.rctime = rollback.orig_src.old_ctime;
8134 }
8135
8136 if (srcdn && srcdn->authority().first == whoami) {
8137 nest_info_t blah;
8138 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
8139 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
8140 }
8141
8142 // repair dest
8143 if (destdn) {
8144 if (rollback.orig_dest.ino && target) {
8145 destdn->push_projected_linkage(target);
8146 } else if (rollback.orig_dest.remote_ino) {
8147 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
8148 rollback.orig_dest.remote_d_type);
8149 } else {
8150 // the dentry will be trimmed soon, it's ok to have wrong linkage
8151 if (rollback.orig_dest.ino)
8152 assert(mds->is_resolve());
8153 destdn->push_projected_linkage();
8154 }
8155 }
8156
8157 if (straydn)
8158 straydn->push_projected_linkage();
8159
8160 if (target) {
8161 CInode::mempool_inode *ti = NULL;
8162 if (target->authority().first == whoami) {
8163 auto &pi = target->project_inode();
8164 mut->add_projected_inode(target);
8165 pi.inode.version = target->pre_dirty();
8166 ti = &pi.inode;
8167 } else
8168 ti = target->get_projected_inode();
8169 if (ti->ctime == rollback.ctime)
8170 ti->ctime = ti->rstat.rctime = rollback.orig_dest.old_ctime;
8171 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
8172 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
8173 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
8174 else
8175 assert(rollback.orig_dest.remote_ino &&
8176 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
8177 } else
8178 ti->nlink++;
8179 }
8180
8181 if (srcdn)
8182 dout(0) << " srcdn back to " << *srcdn << dendl;
8183 if (in)
8184 dout(0) << " srci back to " << *in << dendl;
8185 if (destdn)
8186 dout(0) << " destdn back to " << *destdn << dendl;
8187 if (target)
8188 dout(0) << " desti back to " << *target << dendl;
8189
8190 // journal it
8191 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
8192 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
8193 mdlog->start_entry(le);
8194
8195 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
8196 le->commit.add_dir_context(srcdir);
8197 if (rollback.orig_src.ino)
8198 le->commit.add_primary_dentry(srcdn, 0, true);
8199 else
8200 le->commit.add_remote_dentry(srcdn, true);
8201 }
8202
8203 if (!rollback.orig_src.ino && // remote linkage
8204 in && in->authority().first == whoami) {
8205 le->commit.add_dir_context(in->get_projected_parent_dir());
8206 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
8207 }
8208
8209 if (force_journal_dest) {
8210 assert(rollback.orig_dest.ino);
8211 le->commit.add_dir_context(destdir);
8212 le->commit.add_primary_dentry(destdn, 0, true);
8213 }
8214
8215 // slave: no need to journal straydn
8216
8217 if (target && target != in && target->authority().first == whoami) {
8218 assert(rollback.orig_dest.remote_ino);
8219 le->commit.add_dir_context(target->get_projected_parent_dir());
8220 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
8221 }
8222
8223 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
8224 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
8225 le->commit.renamed_dirino = in->ino();
8226 if (srcdn->authority().first == whoami) {
8227 list<CDir*> ls;
8228 in->get_dirfrags(ls);
8229 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8230 CDir *dir = *p;
8231 if (!dir->is_auth())
8232 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8233 }
8234 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8235 }
8236 } else if (force_journal_dest) {
8237 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8238 le->commit.renamed_dirino = target->ino();
8239 }
8240
8241 if (target && target->is_dir()) {
8242 assert(destdn);
8243 mdcache->project_subtree_rename(target, straydir, destdir);
8244 }
8245
8246 if (in && in->is_dir()) {
8247 assert(srcdn);
8248 mdcache->project_subtree_rename(in, destdir, srcdir);
8249 }
8250
8251 if (mdr && !mdr->more()->slave_update_journaled) {
8252 assert(le->commit.empty());
8253 mdlog->cancel_entry(le);
8254 mut->ls = NULL;
8255 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8256 } else {
8257 assert(!le->commit.empty());
8258 if (mdr)
8259 mdr->more()->slave_update_journaled = false;
8260 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8261 destdn, straydn, finish_mdr);
8262 submit_mdlog_entry(le, fin, mdr, __func__);
8263 mdlog->flush();
8264 }
8265 }
8266
8267 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
8268 version_t srcdnpv, CDentry *destdn,
8269 CDentry *straydn, bool finish_mdr)
8270 {
8271 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
8272
8273 if (straydn) {
8274 straydn->get_dir()->unlink_inode(straydn);
8275 straydn->pop_projected_linkage();
8276 }
8277 if (destdn) {
8278 destdn->get_dir()->unlink_inode(destdn);
8279 destdn->pop_projected_linkage();
8280 }
8281 if (srcdn) {
8282 srcdn->pop_projected_linkage();
8283 if (srcdn->authority().first == mds->get_nodeid())
8284 srcdn->mark_dirty(srcdnpv, mut->ls);
8285 }
8286
8287 mut->apply();
8288
8289 if (srcdn && srcdn->get_linkage()->is_primary()) {
8290 CInode *in = srcdn->get_linkage()->get_inode();
8291 if (srcdn->authority().first == mds->get_nodeid())
8292 in->state_set(CInode::STATE_AUTH);
8293 // update subtree map?
8294 if (in && in->is_dir()) {
8295 assert(destdn);
8296 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
8297 }
8298 }
8299
8300 if (destdn) {
8301 CInode *oldin = destdn->get_linkage()->get_inode();
8302 // update subtree map?
8303 if (oldin && oldin->is_dir()) {
8304 assert(straydn);
8305 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
8306 }
8307 }
8308
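// while resolving, the rolled-back subtree may now be entirely
// non-auth; try to trim it from the cache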
8309 if (mds->is_resolve()) {
8310 CDir *root = NULL;
8311 if (straydn)
8312 root = mdcache->get_subtree_root(straydn->get_dir());
8313 else if (destdn)
8314 root = mdcache->get_subtree_root(destdn->get_dir());
8315 if (root)
8316 mdcache->try_trim_non_auth_subtree(root);
8317 }
8318
8319 if (mdr) {
8320 list<MDSInternalContextBase*> finished;
8321 if (mdr->more()->is_ambiguous_auth) {
8322 if (srcdn->is_auth())
8323 mdr->more()->rename_inode->unfreeze_inode(finished);
8324
8325 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8326 mdr->more()->is_ambiguous_auth = false;
8327 }
8328 mds->queue_waiters(finished);
8329 if (finish_mdr || mdr->aborted)
8330 mdcache->request_finish(mdr);
8331 else
8332 mdr->more()->slave_rolling_back = false;
8333 }
8334
8335 mdcache->finish_rollback(mut->reqid);
8336
8337 mut->cleanup();
8338 }
8339
8340 /* This function DOES put the passed message before returning */
8341 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8342 {
8343 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8344 << " witnessed by " << ack->get_source()
8345 << " " << *ack << dendl;
8346 mds_rank_t from = mds_rank_t(ack->get_source().num());
8347
8348 // note slave
8349 mdr->more()->slaves.insert(from);
8350 if (mdr->more()->srcdn_auth_mds == from &&
8351 mdr->more()->is_remote_frozen_authpin &&
8352 !mdr->more()->is_ambiguous_auth) {
8353 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
8354 }
8355
8356 // did the slave witness the rename, or is it naming extra witnesses (srcdn replicas) we must ask?
8357 assert(mdr->more()->witnessed.count(from) == 0);
8358 if (ack->is_interrupted()) {
8359 dout(10) << " slave request interrupted, noop" << dendl;
8360 } else if (ack->witnesses.empty()) {
8361 mdr->more()->witnessed.insert(from);
8362 if (!ack->is_not_journaled())
8363 mdr->more()->has_journaled_slaves = true;
8364 } else {
8365 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
8366 mdr->more()->extra_witnesses.swap(ack->witnesses);
8367 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
8368 }
8369
8370 // srci import?
8371 if (ack->inode_export.length()) {
8372 dout(10) << " got srci import" << dendl;
8373 mdr->more()->inode_import.claim(ack->inode_export);
8374 mdr->more()->inode_import_v = ack->inode_export_v;
8375 }
8376
8377 // remove from waiting list
8378 assert(mdr->more()->waiting_on_slave.count(from));
8379 mdr->more()->waiting_on_slave.erase(from);
8380
8381 if (mdr->more()->waiting_on_slave.empty())
8382 dispatch_client_request(mdr); // go again!
8383 else
8384 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
8385 }
8386
8387 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8388 {
8389 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8390 << ack->get_source() << dendl;
8391 assert(mdr->is_slave());
8392 mds_rank_t from = mds_rank_t(ack->get_source().num());
8393
8394 if (mdr->more()->waiting_on_slave.count(from)) {
8395 mdr->more()->waiting_on_slave.erase(from);
8396
8397 if (mdr->more()->waiting_on_slave.empty()) {
8398 if (mdr->slave_request)
8399 dispatch_slave_request(mdr);
8400 } else
8401 dout(10) << " still waiting for rename notify acks from "
8402 << mdr->more()->waiting_on_slave << dendl;
8403 }
8404 }
8405
8406 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8407 {
8408 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8409
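// the pending session flush was tracked as a pseudo-slave entry under
// MDS_RANK_NONE; clear it and resume the slave request if nothing
// else is outstanding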
8410 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8411 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8412
8413 if (mdr->more()->waiting_on_slave.empty()) {
8414 if (mdr->slave_request)
8415 dispatch_slave_request(mdr);
8416 } else
8417 dout(10) << " still waiting for rename notify acks from "
8418 << mdr->more()->waiting_on_slave << dendl;
8419 }
8420 }
8421
8422 // snaps
8423 /* This function takes responsibility for the passed mdr */
8424 void Server::handle_client_lssnap(MDRequestRef& mdr)
8425 {
8426 MClientRequest *req = mdr->client_request;
8427
8428 // traverse to path
8429 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8430 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8431 respond_to_request(mdr, -ESTALE);
8432 return;
8433 }
8434 if (!diri->is_auth()) {
8435 mdcache->request_forward(mdr, diri->authority().first);
8436 return;
8437 }
8438 if (!diri->is_dir()) {
8439 respond_to_request(mdr, -ENOTDIR);
8440 return;
8441 }
8442 dout(10) << "lssnap on " << *diri << dendl;
8443
8444 // lock snap
8445 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8446 mds->locker->include_snap_rdlocks(rdlocks, diri);
8447 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8448 return;
8449
8450 if (!check_access(mdr, diri, MAY_READ))
8451 return;
8452
8453 SnapRealm *realm = diri->find_snaprealm();
8454 map<snapid_t,SnapInfo*> infomap;
8455 realm->get_snap_info(infomap, diri->get_oldest_snap());
8456
8457 unsigned max_entries = req->head.args.readdir.max_entries;
8458 if (!max_entries)
8459 max_entries = infomap.size();
8460 int max_bytes = req->head.args.readdir.max_bytes;
8461 if (!max_bytes)
8462 // make sure at least one item can be encoded
8463 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
8464
8465 __u64 last_snapid = 0;
8466 string offset_str = req->get_path2();
8467 if (!offset_str.empty())
8468 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
8469
8470 bufferlist dirbl;
8471 encode_empty_dirstat(dirbl);
8472
8473 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
8474
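// the reply mirrors a readdir frag: dirstat, entry count, flags, then
// the per-entry (name, lease, inodestat) payload appended at the end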
8475 __u32 num = 0;
8476 bufferlist dnbl;
8477 map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
8478 for (; p != infomap.end() && num < max_entries; ++p) {
8479 dout(10) << p->first << " -> " << *p->second << dendl;
8480
8481 // snaps created on this inode use their short name; snaps inherited from an ancestor realm get the long "_<name>_<ino>" form
8482 string snap_name;
8483 if (p->second->ino == diri->ino())
8484 snap_name = std::string(p->second->name);
8485 else
8486 snap_name = std::string(p->second->get_long_name());
8487
8488 unsigned start_len = dnbl.length();
8489 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
8490 break;
8491
8492 ::encode(snap_name, dnbl);
8493 encode_infinite_lease(dnbl);
8494
8495 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
8496 if (r < 0) {
8497 bufferlist keep;
8498 keep.substr_of(dnbl, 0, start_len);
8499 dnbl.swap(keep);
8500 break;
8501 }
8502 ++num;
8503 }
8504
8505 ::encode(num, dirbl);
8506 __u16 flags = 0;
8507 if (p == infomap.end()) {
8508 flags = CEPH_READDIR_FRAG_END;
8509 if (last_snapid == 0)
8510 flags |= CEPH_READDIR_FRAG_COMPLETE;
8511 }
8512 ::encode(flags, dirbl);
8513 dirbl.claim_append(dnbl);
8514
8515 mdr->reply_extra_bl = dirbl;
8516 mdr->tracei = diri;
8517 respond_to_request(mdr, 0);
8518 }
8519
8520
8521 // MKSNAP
8522
8523 struct C_MDS_mksnap_finish : public ServerLogContext {
8524 CInode *diri;
8525 SnapInfo info;
8526 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8527 ServerLogContext(s, r), diri(di), info(i) {}
8528 void finish(int r) override {
8529 server->_mksnap_finish(mdr, diri, info);
8530 }
8531 };
8532
8533 /* This function takes responsibility for the passed mdr */
8534 void Server::handle_client_mksnap(MDRequestRef& mdr)
8535 {
8536 if (!mds->mdsmap->allows_snaps()) {
8537 // snapshot creation is disabled unless it has been explicitly enabled on the mdsmap (allow_new_snaps)
8538 respond_to_request(mdr, -EPERM);
8539 return;
8540 }
8541
8542 MClientRequest *req = mdr->client_request;
8543 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8544 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8545 respond_to_request(mdr, -ESTALE);
8546 return;
8547 }
8548
8549 if (!diri->is_auth()) { // fw to auth?
8550 mdcache->request_forward(mdr, diri->authority().first);
8551 return;
8552 }
8553
8554 // dir only
8555 if (!diri->is_dir()) {
8556 respond_to_request(mdr, -ENOTDIR);
8557 return;
8558 }
8559 if (diri->is_system() && !diri->is_root()) {
8560 // no snaps in system dirs (root is ok)
8561 respond_to_request(mdr, -EPERM);
8562 return;
8563 }
8564
8565 boost::string_view snapname = req->get_filepath().last_dentry();
8566
8567 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8568 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8569 respond_to_request(mdr, -EPERM);
8570 return;
8571 }
8572
8573 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
8574
8575 // lock snap
8576 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8577
8578 mds->locker->include_snap_rdlocks(rdlocks, diri);
8579 rdlocks.erase(&diri->snaplock);
8580 xlocks.insert(&diri->snaplock);
8581
8582 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8583 return;
8584
8585 if (!check_access(mdr, diri, MAY_WRITE))
8586 return;
8587
8588 // make sure name is unique
8589 if (diri->snaprealm &&
8590 diri->snaprealm->exists(snapname)) {
8591 respond_to_request(mdr, -EEXIST);
8592 return;
8593 }
8594 if (snapname.length() == 0 ||
8595 snapname[0] == '_') {
8596 respond_to_request(mdr, -EINVAL);
8597 return;
8598 }
8599
8600 // allocate a snapid
8601 if (!mdr->more()->stid) {
8602 // prepare an stid
8603 mds->snapclient->prepare_create(diri->ino(), snapname,
8604 mdr->get_mds_stamp(),
8605 &mdr->more()->stid, &mdr->more()->snapidbl,
8606 new C_MDS_RetryRequest(mdcache, mdr));
8607 return;
8608 }
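// second pass: the snaptable transaction is prepared and snapidbl
// holds the newly allocated snapid; the table commit happens in
// _mksnap_finish once the journal entry is safe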
8609
8610 version_t stid = mdr->more()->stid;
8611 snapid_t snapid;
8612 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8613 ::decode(snapid, p);
8614 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
8615
8616 // journal
8617 SnapInfo info;
8618 info.ino = diri->ino();
8619 info.snapid = snapid;
8620 info.name = std::string(snapname);
8621 info.stamp = mdr->get_op_stamp();
8622
8623 auto &pi = diri->project_inode(false, true);
8624 pi.inode.ctime = pi.inode.rstat.rctime = info.stamp;
8625 pi.inode.version = diri->pre_dirty();
8626
8627 // project the snaprealm
8628 auto &newsnap = *pi.snapnode;
8629 newsnap.created = snapid;
8630 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
8631 if (!em.second)
8632 em.first->second = info;
8633 newsnap.seq = snapid;
8634 newsnap.last_created = snapid;
8635
8636 // journal the inode changes
8637 mdr->ls = mdlog->get_current_segment();
8638 EUpdate *le = new EUpdate(mdlog, "mksnap");
8639 mdlog->start_entry(le);
8640
8641 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8642 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8643 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8644 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8645
8646 // journal the snaprealm changes
8647 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
8648 mdr, __func__);
8649 mdlog->flush();
8650 }
8651
8652 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
8653 {
8654 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
8655
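// if the realm already existed this is a plain snap create; otherwise
// this op is opening the realm and client realms must be split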
8656 int op = (diri->snaprealm ? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
8657
8658 diri->pop_and_dirty_projected_inode(mdr->ls);
8659 mdr->apply();
8660
8661 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8662
8663 // create snap
8664 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8665
8666 mdcache->do_realm_invalidate_and_update_notify(diri, op);
8667
8668 // yay
8669 mdr->in[0] = diri;
8670 mdr->snapid = info.snapid;
8671 mdr->tracei = diri;
8672 respond_to_request(mdr, 0);
8673 }
8674
8675
8676 // RMSNAP
8677
8678 struct C_MDS_rmsnap_finish : public ServerLogContext {
8679 CInode *diri;
8680 snapid_t snapid;
8681 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8682 ServerLogContext(s, r), diri(di), snapid(sn) {}
8683 void finish(int r) override {
8684 server->_rmsnap_finish(mdr, diri, snapid);
8685 }
8686 };
8687
8688 /* This function takes responsibility for the passed mdr */
8689 void Server::handle_client_rmsnap(MDRequestRef& mdr)
8690 {
8691 MClientRequest *req = mdr->client_request;
8692
8693 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8694 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8695 respond_to_request(mdr, -ESTALE);
8696 return;
8697 }
8698 if (!diri->is_auth()) { // fw to auth?
8699 mdcache->request_forward(mdr, diri->authority().first);
8700 return;
8701 }
8702 if (!diri->is_dir()) {
8703 respond_to_request(mdr, -ENOTDIR);
8704 return;
8705 }
8706
8707 boost::string_view snapname = req->get_filepath().last_dentry();
8708
8709 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8710 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8711 respond_to_request(mdr, -EPERM);
8712 return;
8713 }
8714
8715 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
8716
8717 // does snap exist?
8718 if (snapname.length() == 0 || snapname[0] == '_') {
8719 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
8720 return;
8721 }
8722 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
8723 respond_to_request(mdr, -ENOENT);
8724 return;
8725 }
8726 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
8727 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
8728
8729 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8730 mds->locker->include_snap_rdlocks(rdlocks, diri);
8731 rdlocks.erase(&diri->snaplock);
8732 xlocks.insert(&diri->snaplock);
8733
8734 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8735 return;
8736
8737 if (!check_access(mdr, diri, MAY_WRITE))
8738 return;
8739
8740 // prepare
8741 if (!mdr->more()->stid) {
8742 mds->snapclient->prepare_destroy(diri->ino(), snapid,
8743 &mdr->more()->stid, &mdr->more()->snapidbl,
8744 new C_MDS_RetryRequest(mdcache, mdr));
8745 return;
8746 }
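// second pass: stid/snapidbl are filled in; seq is the new snap
// sequence that supersedes the destroyed snapid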
8747 version_t stid = mdr->more()->stid;
8748 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8749 snapid_t seq;
8750 ::decode(seq, p);
8751 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8752
8753 // journal
8754 auto &pi = diri->project_inode(false, true);
8755 pi.inode.version = diri->pre_dirty();
8756 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
8757
8758 mdr->ls = mdlog->get_current_segment();
8759 EUpdate *le = new EUpdate(mdlog, "rmsnap");
8760 mdlog->start_entry(le);
8761
8762 // project the snaprealm
8763 auto &newnode = *pi.snapnode;
8764 newnode.snaps.erase(snapid);
8765 newnode.seq = seq;
8766 newnode.last_destroyed = seq;
8767
8768 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8769 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8770 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8771 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8772
8773 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
8774 mdr, __func__);
8775 mdlog->flush();
8776 }
8777
8778 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8779 {
8780 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
8781 snapid_t stid = mdr->more()->stid;
8782 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8783 snapid_t seq;
8784 ::decode(seq, p);
8785
8786 diri->pop_and_dirty_projected_inode(mdr->ls);
8787 mdr->apply();
8788
8789 mds->snapclient->commit(stid, mdr->ls);
8790
8791 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8792
8793 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8794
8795 // yay
8796 mdr->in[0] = diri;
8797 respond_to_request(mdr, 0);
8798
8799 // purge snapshot data
8800 if (diri->snaprealm->have_past_parents_open())
8801 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8802 }
8803
8804 struct C_MDS_renamesnap_finish : public ServerLogContext {
8805 CInode *diri;
8806 snapid_t snapid;
8807 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8808 ServerLogContext(s, r), diri(di), snapid(sn) {}
8809 void finish(int r) override {
8810 server->_renamesnap_finish(mdr, diri, snapid);
8811 }
8812 };
8813
8814 /* This function takes responsibility for the passed mdr */
8815 void Server::handle_client_renamesnap(MDRequestRef& mdr)
8816 {
8817 MClientRequest *req = mdr->client_request;
8818 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
8819 respond_to_request(mdr, -EINVAL);
8820 return;
8821 }
8822
8823 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8824 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8825 respond_to_request(mdr, -ESTALE);
8826 return;
8827 }
8828
8829 if (!diri->is_auth()) { // fw to auth?
8830 mdcache->request_forward(mdr, diri->authority().first);
8831 return;
8832 }
8833
8834 if (!diri->is_dir()) { // dir only
8835 respond_to_request(mdr, -ENOTDIR);
8836 return;
8837 }
8838
8839 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
8840 mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8841 respond_to_request(mdr, -EPERM);
8842 return;
8843 }
8844
8845 boost::string_view dstname = req->get_filepath().last_dentry();
8846 boost::string_view srcname = req->get_filepath2().last_dentry();
8847 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
8848
8849 if (srcname.length() == 0 || srcname[0] == '_') {
8850 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
8851 return;
8852 }
8853 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
8854 respond_to_request(mdr, -ENOENT);
8855 return;
8856 }
8857 if (dstname.length() == 0 || dstname[0] == '_') {
8858 respond_to_request(mdr, -EINVAL);
8859 return;
8860 }
8861 if (diri->snaprealm->exists(dstname)) {
8862 respond_to_request(mdr, -EEXIST);
8863 return;
8864 }
8865
8866 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
8867 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
8868
8869 // lock snap
8870 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8871
8872 mds->locker->include_snap_rdlocks(rdlocks, diri);
8873 rdlocks.erase(&diri->snaplock);
8874 xlocks.insert(&diri->snaplock);
8875
8876 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8877 return;
8878
8879 if (!check_access(mdr, diri, MAY_WRITE))
8880 return;
8881
8882 // prepare
8883 if (!mdr->more()->stid) {
8884 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
8885 &mdr->more()->stid, &mdr->more()->snapidbl,
8886 new C_MDS_RetryRequest(mdcache, mdr));
8887 return;
8888 }
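// second pass: the snaptable rename is prepared; it is committed in
// _renamesnap_finish once the journal entry is safe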
8889
8890 version_t stid = mdr->more()->stid;
8891 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8892 snapid_t seq;
8893 ::decode(seq, p);
8894 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8895
8896 // journal
8897 auto &pi = diri->project_inode(false, true);
8898 pi.inode.ctime = pi.inode.rstat.rctime = mdr->get_op_stamp();
8899 pi.inode.version = diri->pre_dirty();
8900
8901 // project the snaprealm
8902 auto &newsnap = *pi.snapnode;
8903 auto it = newsnap.snaps.find(snapid);
8904 assert(it != newsnap.snaps.end());
8905 it->second.name = std::string(dstname);
8906
8907 // journal the inode changes
8908 mdr->ls = mdlog->get_current_segment();
8909 EUpdate *le = new EUpdate(mdlog, "renamesnap");
8910 mdlog->start_entry(le);
8911
8912 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8913 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8914 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8915 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8916
8917 // journal the snaprealm changes
8918 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
8919 mdr, __func__);
8920 mdlog->flush();
8921 }
8922
8923 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8924 {
8925 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
8926
8927 diri->pop_and_dirty_projected_inode(mdr->ls);
8928 mdr->apply();
8929
8930 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8931
8932 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8933
8934 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
8935
8936 // yay
8937 mdr->in[0] = diri;
8938 mdr->tracei = diri;
8939 mdr->snapid = snapid;
8940 respond_to_request(mdr, 0);
8941 }
8942
8943 /**
8944 * Return true if the server is in state RECONNECT and this
8945 * client has not yet reconnected.
8946 */
8947 bool Server::waiting_for_reconnect(client_t c) const
8948 {
8949 return client_reconnect_gather.count(c) > 0;
8950 }
8951
8952 void Server::dump_reconnect_status(Formatter *f) const
8953 {
8954 f->open_object_section("reconnect_status");
8955 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
8956 f->close_section();
8957 }