]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/Server.cc
update sources to v12.1.3
[ceph.git] / ceph / src / mds / Server.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <boost/lexical_cast.hpp>
16#include "include/assert.h" // lexical_cast includes system assert.h
17
18#include <boost/config/warning_disable.hpp>
19#include <boost/fusion/include/std_pair.hpp>
20
21#include "MDSRank.h"
22#include "Server.h"
23#include "Locker.h"
24#include "MDCache.h"
25#include "MDLog.h"
26#include "Migrator.h"
27#include "MDBalancer.h"
28#include "InoTable.h"
29#include "SnapClient.h"
30#include "Mutation.h"
31
32#include "msg/Messenger.h"
33
34#include "osdc/Objecter.h"
35
36#include "messages/MClientSession.h"
37#include "messages/MClientRequest.h"
38#include "messages/MClientReply.h"
39#include "messages/MClientReconnect.h"
40#include "messages/MClientCaps.h"
41#include "messages/MClientSnap.h"
42
43#include "messages/MMDSSlaveRequest.h"
44
45#include "messages/MLock.h"
46
47#include "events/EUpdate.h"
48#include "events/ESlaveUpdate.h"
49#include "events/ESession.h"
50#include "events/EOpen.h"
51#include "events/ECommitted.h"
52
53#include "include/filepath.h"
54#include "common/errno.h"
55#include "common/Timer.h"
56#include "common/perf_counters.h"
57#include "include/compat.h"
58#include "osd/OSDMap.h"
59
60#include <errno.h>
61
62#include <list>
63#include <iostream>
64using namespace std;
65
66#include "common/config.h"
67
68#define dout_context g_ceph_context
69#define dout_subsys ceph_subsys_mds
70#undef dout_prefix
71#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
72
7c673cae
FG
73class ServerContext : public MDSInternalContextBase {
74 protected:
75 Server *server;
76 MDSRank *get_mds() override
77 {
78 return server->mds;
79 }
80
81 public:
82 explicit ServerContext(Server *s) : server(s) {
83 assert(server != NULL);
84 }
85};
86
87class ServerLogContext : public MDSLogContextBase {
88protected:
89 Server *server;
90 MDSRank *get_mds() override
91 {
92 return server->mds;
93 }
94
95 MDRequestRef mdr;
96 void pre_finish(int r) override {
97 if (mdr)
98 mdr->mark_event("journal_committed: ");
99 }
100public:
101 explicit ServerLogContext(Server *s) : server(s) {
102 assert(server != NULL);
103 }
104 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
105 assert(server != NULL);
106 }
107};
108
109void Server::create_logger()
110{
111 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
112 plb.add_u64_counter(l_mdss_handle_client_request,"handle_client_request",
c07f9fc5 113 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 114 plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
c07f9fc5 115 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae 116 plb.add_u64_counter(l_mdss_handle_client_session, "handle_client_session",
c07f9fc5 117 "Client session messages", "hcs", PerfCountersBuilder::PRIO_INTERESTING);
7c673cae
FG
118 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
119 plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
120 plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
121 "Request type lookup hash of inode");
122 plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
123 "Request type lookup inode");
124 plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
125 "Request type lookup parent");
126 plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
127 "Request type lookup name");
128 plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
129 "Request type lookup");
130 plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
131 "Request type lookup snapshot");
132 plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
133 "Request type get attribute");
134 plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
135 "Request type set attribute");
136 plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
137 "Request type set file layout");
138 plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
139 "Request type set directory layout");
140 plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
141 "Request type set extended attribute");
142 plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
143 "Request type remove extended attribute");
144 plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
145 "Request type read directory");
146 plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
147 "Request type set file lock");
148 plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
149 "Request type get file lock");
150 plb.add_u64_counter(l_mdss_req_create, "req_create",
151 "Request type create");
152 plb.add_u64_counter(l_mdss_req_open, "req_open",
153 "Request type open");
154 plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
155 "Request type make node");
156 plb.add_u64_counter(l_mdss_req_link, "req_link",
157 "Request type link");
158 plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
159 "Request type unlink");
160 plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
161 "Request type remove directory");
162 plb.add_u64_counter(l_mdss_req_rename, "req_rename",
163 "Request type rename");
164 plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
165 "Request type make directory");
166 plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
167 "Request type symbolic link");
168 plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
169 "Request type list snapshot");
170 plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
171 "Request type make snapshot");
172 plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
173 "Request type remove snapshot");
174 plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
175 "Request type rename snapshot");
176 logger = plb.create_perf_counters();
177 g_ceph_context->get_perfcounters_collection()->add(logger);
178}
179
180Server::Server(MDSRank *m) :
181 mds(m),
182 mdcache(mds->mdcache), mdlog(mds->mdlog),
183 logger(0),
184 is_full(false),
185 reconnect_done(NULL),
186 failed_reconnects(0),
31f18b77 187 reconnect_evicting(false),
7c673cae
FG
188 terminating_sessions(false)
189{
190}
191
192
193/* This function DOES put the passed message before returning*/
194void Server::dispatch(Message *m)
195{
196 switch (m->get_type()) {
197 case CEPH_MSG_CLIENT_RECONNECT:
198 handle_client_reconnect(static_cast<MClientReconnect*>(m));
199 return;
200 }
201
202 // active?
203 if (!mds->is_active() &&
204 !(mds->is_stopping() && m->get_source().is_mds())) {
205 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST &&
206 (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT)) {
207 MClientRequest *req = static_cast<MClientRequest*>(m);
208 Session *session = get_session(req);
209 if (!session || session->is_closed()) {
210 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
211 req->put();
212 return;
213 }
214 bool queue_replay = false;
215 if (req->is_replay()) {
216 dout(3) << "queuing replayed op" << dendl;
217 queue_replay = true;
218 } else if (req->get_retry_attempt()) {
219 // process completed request in clientreplay stage. The completed request
220 // might have created new file/directorie. This guarantees MDS sends a reply
221 // to client before other request modifies the new file/directorie.
222 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
223 dout(3) << "queuing completed op" << dendl;
224 queue_replay = true;
225 }
226 // this request was created before the cap reconnect message, drop any embedded
227 // cap releases.
228 req->releases.clear();
229 }
230 if (queue_replay) {
231 req->mark_queued_for_replay();
232 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
233 return;
234 }
235 }
236
237 bool wait_for_active = true;
238 if (m->get_type() == MSG_MDS_SLAVE_REQUEST) {
239 // handle_slave_request() will wait if necessary
240 wait_for_active = false;
241 } else if (mds->is_clientreplay()) {
242 // session open requests need to be handled during replay,
243 // close requests need to be delayed
244 if ((m->get_type() == CEPH_MSG_CLIENT_SESSION &&
245 (static_cast<MClientSession*>(m))->get_op() != CEPH_SESSION_REQUEST_CLOSE)) {
246 wait_for_active = false;
247 } else if (m->get_type() == CEPH_MSG_CLIENT_REQUEST) {
248 MClientRequest *req = static_cast<MClientRequest*>(m);
249 if (req->is_queued_for_replay()) {
250 wait_for_active = false;
251 }
252 }
253 }
254 if (wait_for_active) {
255 dout(3) << "not active yet, waiting" << dendl;
256 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
257 return;
258 }
259 }
260
261 switch (m->get_type()) {
262 case CEPH_MSG_CLIENT_SESSION:
263 handle_client_session(static_cast<MClientSession*>(m));
264 return;
265 case CEPH_MSG_CLIENT_REQUEST:
266 handle_client_request(static_cast<MClientRequest*>(m));
267 return;
268 case MSG_MDS_SLAVE_REQUEST:
269 handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
270 return;
271 default:
272 derr << "server unknown message " << m->get_type() << dendl;
273 assert(0 == "server unknown message");
274 }
275}
276
277
278
279// ----------------------------------------------------------
280// SESSION management
281
282class C_MDS_session_finish : public ServerLogContext {
283 Session *session;
284 uint64_t state_seq;
285 bool open;
286 version_t cmapv;
287 interval_set<inodeno_t> inos;
288 version_t inotablev;
289 Context *fin;
290public:
291 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
292 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
293 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
294 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
295 void finish(int r) override {
296 assert(r == 0);
297 server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
298 if (fin) {
299 fin->complete(r);
300 }
301 }
302};
303
304Session *Server::get_session(Message *m)
305{
306 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
307 if (session) {
308 dout(20) << "get_session have " << session << " " << session->info.inst
309 << " state " << session->get_state_name() << dendl;
310 session->put(); // not carry ref
311 } else {
312 dout(20) << "get_session dne for " << m->get_source_inst() << dendl;
313 }
314 return session;
315}
316
317/* This function DOES put the passed message before returning*/
318void Server::handle_client_session(MClientSession *m)
319{
320 version_t pv;
31f18b77 321 bool blacklisted = false;
7c673cae
FG
322 Session *session = get_session(m);
323
324 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
325 assert(m->get_source().is_client()); // should _not_ come from an mds!
326
327 if (!session) {
328 dout(0) << " ignoring sessionless msg " << *m << dendl;
329 m->put();
330 return;
331 }
332
333 if (logger)
334 logger->inc(l_mdss_handle_client_session);
335
336 uint64_t sseq = 0;
337 switch (m->get_op()) {
338 case CEPH_SESSION_REQUEST_OPEN:
339 if (session->is_opening() ||
340 session->is_open() ||
341 session->is_stale() ||
342 session->is_killing()) {
343 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
344 m->put();
345 return;
346 }
347 assert(session->is_closed() ||
348 session->is_closing());
349
31f18b77
FG
350 blacklisted = mds->objecter->with_osdmap(
351 [session](const OSDMap &osd_map) -> bool {
352 return osd_map.is_blacklisted(session->info.inst.addr);
353 });
354
355 if (blacklisted) {
356 dout(10) << "ignoring blacklisted client " << session->info.inst.addr << dendl;
357 m->put();
358 return;
359 }
360
7c673cae
FG
361 session->set_client_metadata(m->client_meta);
362 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
363 << session->info.client_metadata.size() << " metadata entries:" << dendl;
364 for (map<string, string>::iterator i = session->info.client_metadata.begin();
365 i != session->info.client_metadata.end(); ++i) {
366 dout(20) << " " << i->first << ": " << i->second << dendl;
367 }
368
369 // Special case for the 'root' metadata path; validate that the claimed
370 // root is actually within the caps of the session
371 if (session->info.client_metadata.count("root")) {
372 const auto claimed_root = session->info.client_metadata.at("root");
373 // claimed_root has a leading "/" which we strip before passing
374 // into caps check
375 if (claimed_root.empty() || claimed_root[0] != '/' ||
376 !session->auth_caps.path_capable(claimed_root.substr(1))) {
377 derr << __func__ << " forbidden path claimed as mount root: "
378 << claimed_root << " by " << m->get_source() << dendl;
379 // Tell the client we're rejecting their open
380 mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
381 mds->clog->warn() << "client session with invalid root '" <<
382 claimed_root << "' denied (" << session->info.inst << ")";
383 session->clear();
384 // Drop out; don't record this session in SessionMap or journal it.
385 break;
386 }
387 }
388
389 if (session->is_closed())
390 mds->sessionmap.add_session(session);
391
392 pv = mds->sessionmap.mark_projected(session);
393 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
394 mds->sessionmap.touch_session(session);
395 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
396 new C_MDS_session_finish(this, session, sseq, true, pv));
397 mdlog->flush();
398 break;
399
400 case CEPH_SESSION_REQUEST_RENEWCAPS:
401 if (session->is_open() ||
402 session->is_stale()) {
403 mds->sessionmap.touch_session(session);
404 if (session->is_stale()) {
405 mds->sessionmap.set_state(session, Session::STATE_OPEN);
406 mds->locker->resume_stale_caps(session);
407 mds->sessionmap.touch_session(session);
408 }
409 m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
410 } else {
411 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
412 }
413 break;
414
415 case CEPH_SESSION_REQUEST_CLOSE:
416 {
417 if (session->is_closed() ||
418 session->is_closing() ||
419 session->is_killing()) {
420 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
421 m->put();
422 return;
423 }
424 if (session->is_importing()) {
425 dout(10) << "ignoring close req on importing session" << dendl;
426 m->put();
427 return;
428 }
429 assert(session->is_open() ||
430 session->is_stale() ||
431 session->is_opening());
432 if (m->get_seq() < session->get_push_seq()) {
433 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
434 << ", dropping" << dendl;
435 m->put();
436 return;
437 }
438 // We are getting a seq that is higher than expected.
439 // Handle the same as any other seqn error.
440 //
441 if (m->get_seq() != session->get_push_seq()) {
442 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
443 << ", BUGGY!" << dendl;
444 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
445 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
446 m->put();
447 return;
448 }
449 journal_close_session(session, Session::STATE_CLOSING, NULL);
450 }
451 break;
452
453 case CEPH_SESSION_FLUSHMSG_ACK:
454 finish_flush_session(session, m->get_seq());
455 break;
456
31f18b77
FG
457 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
458 mdlog->flush();
459 break;
460
7c673cae
FG
461 default:
462 ceph_abort();
463 }
464 m->put();
465}
466
467void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
468{
469 for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
470 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
471 assert(session);
472 if (!session->is_open() ||
473 !session->connection.get() ||
474 !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
475 continue;
476 version_t seq = session->wait_for_flush(gather.new_sub());
477 mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
478 }
479}
480
481void Server::finish_flush_session(Session *session, version_t seq)
482{
483 list<MDSInternalContextBase*> finished;
484 session->finish_flush(seq, finished);
485 mds->queue_waiters(finished);
486}
487
488void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
489 interval_set<inodeno_t>& inos, version_t piv)
490{
491 dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
492 << " " << pv << dendl;
493
494 if (piv) {
495 assert(session->is_closing() || session->is_killing() ||
496 session->is_opening()); // re-open closing session
497 session->info.prealloc_inos.subtract(inos);
498 mds->inotable->apply_release_ids(inos);
499 assert(mds->inotable->get_version() == piv);
500 }
501
502 mds->sessionmap.mark_dirty(session);
503
504 // apply
505 if (session->get_state_seq() != state_seq) {
506 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
507 << ", noop" << dendl;
508 // close must have been canceled (by an import?), or any number of other things..
509 } else if (open) {
510 assert(session->is_opening());
511 mds->sessionmap.set_state(session, Session::STATE_OPEN);
512 mds->sessionmap.touch_session(session);
513 assert(session->connection != NULL);
514 session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
515 if (mdcache->is_readonly())
516 session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
517 } else if (session->is_closing() ||
518 session->is_killing()) {
519 // kill any lingering capabilities, leases, requests
520 while (!session->caps.empty()) {
521 Capability *cap = session->caps.front();
522 CInode *in = cap->get_inode();
523 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
524 mds->locker->remove_client_cap(in, session->info.inst.name.num());
525 }
526 while (!session->leases.empty()) {
527 ClientLease *r = session->leases.front();
528 CDentry *dn = static_cast<CDentry*>(r->parent);
529 dout(20) << " killing client lease of " << *dn << dendl;
530 dn->remove_client_lease(r, mds->locker);
531 }
532 if (client_reconnect_gather.count(session->info.get_client())) {
533 dout(20) << " removing client from reconnect set" << dendl;
534 client_reconnect_gather.erase(session->info.get_client());
535
536 if (client_reconnect_gather.empty()) {
537 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
538 reconnect_gather_finish();
539 }
540 }
541
542 if (session->is_closing()) {
543 // mark con disposable. if there is a fault, we will get a
544 // reset and clean it up. if the client hasn't received the
545 // CLOSE message yet, they will reconnect and get an
546 // ms_handle_remote_reset() and realize they had in fact closed.
547 // do this *before* sending the message to avoid a possible
548 // race.
549 if (session->connection != NULL) {
550 // Conditional because terminate_sessions will indiscrimately
551 // put sessions in CLOSING whether they ever had a conn or not.
552 session->connection->mark_disposable();
553 }
554
555 // reset session
556 mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
557 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
558 session->clear();
559 mds->sessionmap.remove_session(session);
560 } else if (session->is_killing()) {
561 // destroy session, close connection
562 if (session->connection != NULL) {
563 session->connection->mark_down();
564 }
565 mds->sessionmap.remove_session(session);
566 } else {
567 ceph_abort();
568 }
569 } else {
570 ceph_abort();
571 }
572}
573
574/**
575 * Inject sessions from some source other than actual connections.
576 *
577 * For example:
578 * - sessions inferred from journal replay
579 * - sessions learned from other MDSs during rejoin
580 * - sessions learned from other MDSs during dir/caps migration
581 * - sessions learned from other MDSs during a cross-MDS rename
582 */
583version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
584 map<client_t,uint64_t>& sseqmap)
585{
586 version_t pv = mds->sessionmap.get_projected();
587
588 dout(10) << "prepare_force_open_sessions " << pv
589 << " on " << cm.size() << " clients"
590 << dendl;
591 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
592
593 Session *session = mds->sessionmap.get_or_add_session(p->second);
594 pv = mds->sessionmap.mark_projected(session);
595 if (session->is_closed() ||
596 session->is_closing() ||
597 session->is_killing())
598 sseqmap[p->first] = mds->sessionmap.set_state(session, Session::STATE_OPENING);
599 else
600 assert(session->is_open() ||
601 session->is_opening() ||
602 session->is_stale());
603 session->inc_importing();
604 }
605 return pv;
606}
607
608void Server::finish_force_open_sessions(map<client_t,entity_inst_t>& cm,
609 map<client_t,uint64_t>& sseqmap,
610 bool dec_import)
611{
612 /*
613 * FIXME: need to carefully consider the race conditions between a
614 * client trying to close a session and an MDS doing an import
615 * trying to force open a session...
616 */
617 dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
618 << " initial v " << mds->sessionmap.get_version() << dendl;
619
620
621 int sessions_inserted = 0;
622 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
623 sessions_inserted++;
624
625 Session *session = mds->sessionmap.get_session(p->second.name);
626 assert(session);
627
628 if (sseqmap.count(p->first)) {
629 uint64_t sseq = sseqmap[p->first];
630 if (session->get_state_seq() != sseq) {
631 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
632 } else {
633 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
634 mds->sessionmap.set_state(session, Session::STATE_OPEN);
635 mds->sessionmap.touch_session(session);
636 mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
637 if (mdcache->is_readonly())
638 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
639 }
640 } else {
641 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
642 assert(session->is_open() || session->is_stale());
643 }
644
645 if (dec_import) {
646 session->dec_importing();
647 }
648
649 mds->sessionmap.mark_dirty(session);
650 }
651
652 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
653}
654
655class C_MDS_TerminatedSessions : public ServerContext {
656 void finish(int r) override {
657 server->terminating_sessions = false;
658 }
659 public:
660 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
661};
662
663void Server::terminate_sessions()
664{
665 dout(2) << "terminate_sessions" << dendl;
666
667 terminating_sessions = true;
668
669 // kill them off. clients will retry etc.
670 set<Session*> sessions;
671 mds->sessionmap.get_client_session_set(sessions);
672 for (set<Session*>::const_iterator p = sessions.begin();
673 p != sessions.end();
674 ++p) {
675 Session *session = *p;
676 if (session->is_closing() ||
677 session->is_killing() ||
678 session->is_closed())
679 continue;
680 journal_close_session(session, Session::STATE_CLOSING, NULL);
681 }
682
683 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
684}
685
686
687void Server::find_idle_sessions()
688{
689 dout(10) << "find_idle_sessions. laggy until " << mds->get_laggy_until() << dendl;
690
691 // timeout/stale
692 // (caps go stale, lease die)
693 utime_t now = ceph_clock_now();
694 utime_t cutoff = now;
695 cutoff -= g_conf->mds_session_timeout;
696 while (1) {
697 Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
698 if (!session) break;
699 dout(20) << "laggiest active session is " << session->info.inst << dendl;
700 if (session->last_cap_renew >= cutoff) {
701 dout(20) << "laggiest active session is " << session->info.inst << " and sufficiently new ("
702 << session->last_cap_renew << ")" << dendl;
703 break;
704 }
705
706 dout(10) << "new stale session " << session->info.inst << " last " << session->last_cap_renew << dendl;
707 mds->sessionmap.set_state(session, Session::STATE_STALE);
708 mds->locker->revoke_stale_caps(session);
709 mds->locker->remove_stale_leases(session);
710 mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
711 finish_flush_session(session, session->get_push_seq());
712 }
713
714 // autoclose
715 cutoff = now;
716 cutoff -= g_conf->mds_session_autoclose;
717
718 // don't kick clients if we've been laggy
719 if (mds->get_laggy_until() > cutoff) {
720 dout(10) << " laggy_until " << mds->get_laggy_until() << " > cutoff " << cutoff
721 << ", not kicking any clients to be safe" << dendl;
722 return;
723 }
724
725 if (mds->sessionmap.get_sessions().size() == 1 &&
726 mds->mdsmap->get_num_in_mds() == 1) {
727 dout(20) << "not evicting a slow client, because there is only one"
728 << dendl;
729 return;
730 }
731
31f18b77
FG
732 // Collect a list of sessions exceeding the autoclose threshold
733 std::vector<Session *> to_evict;
734 const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
735 if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
736 return;
737 }
738 const auto &stale_sessions = sessions_p->second;
739 assert(stale_sessions != nullptr);
740
741 for (const auto &session: *stale_sessions) {
7c673cae
FG
742 if (session->is_importing()) {
743 dout(10) << "stopping at importing session " << session->info.inst << dendl;
744 break;
745 }
746 assert(session->is_stale());
747 if (session->last_cap_renew >= cutoff) {
748 dout(20) << "oldest stale session is " << session->info.inst << " and sufficiently new ("
749 << session->last_cap_renew << ")" << dendl;
750 break;
751 }
31f18b77
FG
752
753 to_evict.push_back(session);
754 }
755
756 for (const auto &session: to_evict) {
7c673cae
FG
757 utime_t age = now;
758 age -= session->last_cap_renew;
31f18b77
FG
759 mds->clog->warn() << "evicting unresponsive client " << *session
760 << ", after " << age << " seconds";
761 dout(10) << "autoclosing stale session " << session->info.inst << " last "
762 << session->last_cap_renew << dendl;
763
764 if (g_conf->mds_session_blacklist_on_timeout) {
765 std::stringstream ss;
766 mds->evict_client(session->info.inst.name.num(), false, true,
767 ss, nullptr);
768 } else {
769 kill_session(session, NULL);
770 }
7c673cae
FG
771 }
772}
773
774/*
775 * XXX bump in the interface here, not using an MDSInternalContextBase here
776 * because all the callers right now happen to use a SaferCond
777 */
778void Server::kill_session(Session *session, Context *on_safe)
779{
31f18b77
FG
780 assert(mds->mds_lock.is_locked_by_me());
781
7c673cae
FG
782 if ((session->is_opening() ||
783 session->is_open() ||
784 session->is_stale()) &&
785 !session->is_importing()) {
786 dout(10) << "kill_session " << session << dendl;
787 journal_close_session(session, Session::STATE_KILLING, on_safe);
788 } else {
789 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
790 assert(session->is_closing() ||
791 session->is_closed() ||
792 session->is_killing() ||
793 session->is_importing());
794 if (on_safe) {
795 on_safe->complete(0);
796 }
797 }
798}
799
31f18b77
FG
800size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
801{
802 std::list<Session*> victims;
803 const auto sessions = mds->sessionmap.get_sessions();
804 for (const auto p : sessions) {
805 if (!p.first.is_client()) {
806 // Do not apply OSDMap blacklist to MDS daemons, we find out
807 // about their death via MDSMap.
808 continue;
809 }
810
811 Session *s = p.second;
812 if (blacklist.count(s->info.inst.addr)) {
813 victims.push_back(s);
814 }
815 }
816
817 for (const auto s : victims) {
818 kill_session(s, nullptr);
819 }
820
821 dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
822
823 return victims.size();
824}
825
7c673cae
FG
826void Server::journal_close_session(Session *session, int state, Context *on_safe)
827{
828 uint64_t sseq = mds->sessionmap.set_state(session, state);
829 version_t pv = mds->sessionmap.mark_projected(session);
830 version_t piv = 0;
831
832 // release alloc and pending-alloc inos for this session
833 // and wipe out session state, in case the session close aborts for some reason
834 interval_set<inodeno_t> both;
835 both.insert(session->info.prealloc_inos);
836 both.insert(session->pending_prealloc_inos);
837 if (both.size()) {
838 mds->inotable->project_release_ids(both);
839 piv = mds->inotable->get_projected_version();
840 } else
841 piv = 0;
842
843 mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
844 new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
845 mdlog->flush();
846
847 // clean up requests, too
848 elist<MDRequestImpl*>::iterator p =
849 session->requests.begin(member_offset(MDRequestImpl,
850 item_session_request));
851 while (!p.end()) {
852 MDRequestRef mdr = mdcache->request_get((*p)->reqid);
853 ++p;
854 mdcache->request_kill(mdr);
855 }
856
857 finish_flush_session(session, session->get_push_seq());
858}
859
860void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
861{
862 reconnect_done = reconnect_done_;
863 mds->sessionmap.get_client_set(client_reconnect_gather);
864
865 if (client_reconnect_gather.empty()) {
866 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
867 reconnect_gather_finish();
868 return;
869 }
870
871 // clients will get the mdsmap and discover we're reconnecting via the monitor.
872
873 reconnect_start = ceph_clock_now();
874 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
875 mds->sessionmap.dump();
876}
877
/* This function DOES put the passed message before returning*/
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = get_session(m);
  assert(session);

  // Delivery race: the client saw the new mdsmap (reconnect state) before we
  // finished processing it ourselves.  Park the message until we enter
  // reconnect, then retry.
  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // How long after reconnect_clients() started the window this client showed
  // up -- used in the log/clog messages below.
  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  // Decide whether to refuse the reconnect; all deny paths answer with a
  // session CLOSE below.
  bool deny = false;
  if (!mds->is_reconnect()) {
    // The reconnect window has already closed.
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
       << ceph_mds_state_name(mds->get_state())
       << ") from " << m->get_source_inst()
       << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (session->is_closed()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
       << ceph_mds_state_name(mds->get_state())
       << ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = ceph_clock_now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms: re-attach each realm the client reports.  Realms whose past
  // parents are not open yet are only recorded for later resolution.
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;  // inode is being purged; nothing to reconnect to
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
        dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
        mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
        dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
               << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps: each reported cap goes to one of three buckets -- inode is here and
  // ours (reconnect it now), here but non-auth (export to the authority), or
  // not in cache (record for resolution after it is loaded).
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_export_caps(p->first, from, p->second,
                                  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }

  // remove from gather set; the last client to reconnect finishes the gather.
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}
994
995
996
// Called once client_reconnect_gather drains (every client reconnected, or we
// gave up on / evicted the stragglers): fire the completion that was stashed
// by reconnect_clients().  The complete-then-clear order is deliberate.
void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  assert(reconnect_done);
  reconnect_done->complete(0);
  reconnect_done = NULL;
}
1004
// Periodic check during the reconnect phase: once the configured timeout has
// elapsed, give up on (and possibly blacklist-evict) any client that has not
// reconnected yet, then finish the gather.
void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    // Blacklist evictions from a previous tick are still in flight; their
    // gather finisher will call reconnect_gather_finish() for us.
    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  // Clients get mds_reconnect_timeout seconds from reconnect_start.
  utime_t reconnect_end = reconnect_start;
  reconnect_end += g_conf->mds_reconnect_timeout;
  if (ceph_clock_now() >= reconnect_end &&
      !client_reconnect_gather.empty()) {
    dout(10) << "reconnect timed out" << dendl;

    // If we're doing blacklist evictions, use this to wait for them before
    // proceeding to reconnect_gather_finish
    MDSGatherBuilder gather(g_ceph_context);

    for (set<client_t>::iterator p = client_reconnect_gather.begin();
         p != client_reconnect_gather.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
      assert(session);
      dout(1) << "reconnect gave up on " << session->info.inst << dendl;

      mds->clog->warn() << "evicting unresponsive client " << *session
                        << ", after waiting " << g_conf->mds_reconnect_timeout
                        << " seconds during MDS startup";

      if (g_conf->mds_session_blacklist_on_timeout) {
        // Evict with OSD blacklisting; completion is tracked via the gather.
        std::stringstream ss;
        mds->evict_client(session->info.inst.name.num(), false, true, ss,
                          gather.new_sub());
      } else {
        // Plain session kill, no blacklist entry.
        kill_session(session, NULL);
      }

      failed_reconnects++;
    }
    client_reconnect_gather.clear();

    if (gather.has_subs()) {
      // Defer reconnect_gather_finish() until all evictions have landed;
      // reconnect_evicting makes subsequent ticks no-ops meanwhile.
      dout(1) << "reconnect will complete once clients are evicted" << dendl;
      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
            [this](int r){reconnect_gather_finish();})));
      gather.activate();
      reconnect_evicting = true;
    } else {
      reconnect_gather_finish();
    }
  }
}
1056
1057void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1058{
1059 if (!locks.length()) return;
1060 int numlocks;
1061 ceph_filelock lock;
1062 bufferlist::iterator p = locks.begin();
1063 ::decode(numlocks, p);
1064 for (int i = 0; i < numlocks; ++i) {
1065 ::decode(lock, p);
1066 lock.client = client;
1067 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1068 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1069 }
1070 ::decode(numlocks, p);
1071 for (int i = 0; i < numlocks; ++i) {
1072 ::decode(lock, p);
1073 lock.client = client;
1074 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1075 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1076 }
1077}
1078
1079
1080/**
1081 * Call this when the MDCache is oversized, to send requests to the clients
1082 * to trim some caps, and consequently unpin some inodes in the MDCache so
1083 * that it can trim too.
1084 */
1085void Server::recall_client_state(float ratio)
1086{
1087 int max_caps_per_client = (int)(g_conf->mds_cache_size * .8);
1088 int min_caps_per_client = 100;
1089
1090 dout(10) << "recall_client_state " << ratio
1091 << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
1092 << dendl;
1093
1094 set<Session*> sessions;
1095 mds->sessionmap.get_client_session_set(sessions);
1096 for (set<Session*>::const_iterator p = sessions.begin();
1097 p != sessions.end();
1098 ++p) {
1099 Session *session = *p;
1100 if (!session->is_open() ||
1101 !session->info.inst.name.is_client())
1102 continue;
1103
1104 dout(10) << " session " << session->info.inst
1105 << " caps " << session->caps.size()
1106 << ", leases " << session->leases.size()
1107 << dendl;
1108
1109 if (session->caps.size() > min_caps_per_client) {
1110 int newlim = MIN((int)(session->caps.size() * ratio), max_caps_per_client);
1111 if (session->caps.size() > newlim) {
1112 MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
1113 m->head.max_caps = newlim;
1114 mds->send_message_client(m, session);
1115 session->notify_recall_sent(newlim);
1116 }
1117 }
1118 }
1119}
1120
1121void Server::force_clients_readonly()
1122{
1123 dout(10) << "force_clients_readonly" << dendl;
1124 set<Session*> sessions;
1125 mds->sessionmap.get_client_session_set(sessions);
1126 for (set<Session*>::const_iterator p = sessions.begin();
1127 p != sessions.end();
1128 ++p) {
1129 Session *session = *p;
1130 if (!session->info.inst.name.is_client() ||
1131 !(session->is_open() || session->is_stale()))
1132 continue;
1133 mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
1134 }
1135}
1136
1137/*******
1138 * some generic stuff for finishing off requests
1139 */
// Journal the (already prepared) log event for this request and reply to the
// client.  May send an unsafe "early" reply before the journal commit; the
// safe reply is sent by `fin` once the event is durable.
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  // Try to answer the client now, before the journal flush (no-op if early
  // replies are disabled or not allowed for this request).
  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    // Client-replay ops are processed one at a time; only flush the log once
    // the last replayed op has been journaled.
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    // Client already has its unsafe answer; just release rdlocks and let the
    // journal flush on its own schedule.
    mds->locker->drop_rdlocks(mdr.get());
  else
    // No early reply went out, so the client is blocked on us: flush now.
    mdlog->flush();
}
1171
1172void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1173 const char *event)
1174{
1175 if (mdr) {
1176 string event_str("submit entry: ");
1177 event_str += event;
1178 mdr->mark_event_string(event_str);
1179 }
1180 mdlog->submit_entry(le, fin);
1181}
1182
/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));

    // add here to avoid counting ops multiple times (e.g., locks, loading)
    switch(mdr->client_request->get_op()) {
    case CEPH_MDS_OP_LOOKUPHASH:
      logger->inc(l_mdss_req_lookuphash);
      break;
    case CEPH_MDS_OP_LOOKUPINO:
      logger->inc(l_mdss_req_lookupino);
      break;
    case CEPH_MDS_OP_LOOKUPPARENT:
      logger->inc(l_mdss_req_lookupparent);
      break;
    case CEPH_MDS_OP_LOOKUPNAME:
      logger->inc(l_mdss_req_lookupname);
      break;
    case CEPH_MDS_OP_LOOKUP:
      logger->inc(l_mdss_req_lookup);
      break;
    case CEPH_MDS_OP_LOOKUPSNAP:
      logger->inc(l_mdss_req_lookupsnap);
      break;
    case CEPH_MDS_OP_GETATTR:
      logger->inc(l_mdss_req_getattr);
      break;
    case CEPH_MDS_OP_SETATTR:
      logger->inc(l_mdss_req_setattr);
      break;
    case CEPH_MDS_OP_SETLAYOUT:
      logger->inc(l_mdss_req_setlayout);
      break;
    case CEPH_MDS_OP_SETDIRLAYOUT:
      logger->inc(l_mdss_req_setdirlayout);
      break;
    case CEPH_MDS_OP_SETXATTR:
      logger->inc(l_mdss_req_setxattr);
      break;
    case CEPH_MDS_OP_RMXATTR:
      logger->inc(l_mdss_req_rmxattr);
      break;
    case CEPH_MDS_OP_READDIR:
      logger->inc(l_mdss_req_readdir);
      break;
    case CEPH_MDS_OP_SETFILELOCK:
      logger->inc(l_mdss_req_setfilelock);
      break;
    case CEPH_MDS_OP_GETFILELOCK:
      logger->inc(l_mdss_req_getfilelock);
      break;
    case CEPH_MDS_OP_CREATE:
      logger->inc(l_mdss_req_create);
      // fall-through (no break): a CREATE is counted as an OPEN as well --
      // appears intentional; confirm before "fixing".
    case CEPH_MDS_OP_OPEN:
      logger->inc(l_mdss_req_open);
      break;
    case CEPH_MDS_OP_MKNOD:
      logger->inc(l_mdss_req_mknod);
      break;
    case CEPH_MDS_OP_LINK:
      logger->inc(l_mdss_req_link);
      break;
    case CEPH_MDS_OP_UNLINK:
      logger->inc(l_mdss_req_unlink);
      break;
    case CEPH_MDS_OP_RMDIR:
      logger->inc(l_mdss_req_rmdir);
      break;
    case CEPH_MDS_OP_RENAME:
      logger->inc(l_mdss_req_rename);
      break;
    case CEPH_MDS_OP_MKDIR:
      logger->inc(l_mdss_req_mkdir);
      break;
    case CEPH_MDS_OP_SYMLINK:
      logger->inc(l_mdss_req_symlink);
      break;
    case CEPH_MDS_OP_LSSNAP:
      logger->inc(l_mdss_req_lssnap);
      break;
    case CEPH_MDS_OP_MKSNAP:
      logger->inc(l_mdss_req_mksnap);
      break;
    case CEPH_MDS_OP_RMSNAP:
      logger->inc(l_mdss_req_rmsnap);
      break;
    case CEPH_MDS_OP_RENAMESNAP:
      logger->inc(l_mdss_req_renamesnap);
      break;
    }
  } else if (mdr->internal_op > -1) {
    // Internal (MDS-originated) request: no client to reply to; just run the
    // registered finisher with the result code.
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}
1285
// Send an "unsafe" reply to the client before the journal commit, when
// allowed.  On success, sets mdr->did_early_reply so the safe reply path
// skips the trace/lease work it already did here.
void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf->mds_early_reply)
    return;

  // NOTE(review): journaled slave updates and freshly allocated inos are
  // treated as unsafe to expose before commit -- presumably because the
  // result depends on state that other MDSs / the inotable have not yet
  // made durable; confirm before relaxing either check.
  if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
    dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  MClientRequest *req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;  // MDS-originated requests never get early replies

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  MClientReply *reply = new MClientReply(req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  //_rename_finish() does not send dentry link/unlink message to replicas.
  // so do not set xlocks on dentries "done", the xlocks prevent dentries
  // that have projected linkages from getting new replica.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
           << " (" << cpp_strerror(reply->get_result())
           << ") " << *req << dendl;

  if (tracei || tracedn) {
    // The trace conveys the caps implicitly, so any embedded cap releases for
    // the traced inodes must not be processed.
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
                   req->get_dentry_wanted(), mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  req->get_connection()->send_message(reply);

  mdr->did_early_reply = true;

  // Count the reply and its latency now; the safe-reply path skips these
  // when did_early_reply is set.
  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
1348
/*
 * send given reply
 * include a trace to tracei
 * Clean up mdr
 */
void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
{
  assert(mdr.get());
  MClientRequest *req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
          << " (" << cpp_strerror(reply->get_result())
          << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special, they only modify states in MDS memory.
  // The states get lost when MDS fails. If Client re-send a completed
  // setfilelock request, it means that client did not receive corresponding
  // setfilelock reply.  So MDS should re-execute the setfilelock request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      // Make sure the dirtied completed_requests list gets flushed with this
      // log segment.
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  snapid_t snapid = mdr->snapid;
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  // Snapshot everything needed after request_finish() below tears mdr down.
  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();
  int dentry_wanted = req->get_dentry_wanted();

  if (!did_early_reply && !is_replay) {

    // Reply stats are counted exactly once -- here, or in early_reply().
    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    dout(20) << "lat " << lat << dendl;

    // Trace conveys caps implicitly; drop embedded releases for traced inodes.
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (client_inst.name.is_mds() || !session) {
    reply->put();   // mds doesn't need a reply
    reply = 0;
    // NOTE(review): the is_queued_for_replay() branch below dereferences
    // `reply` -- safe only if replayed requests always have a live client
    // session (as handle_client_request appears to enforce); confirm.
  } else {
    // send reply.
    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
        (tracei || tracedn)) {
      if (is_replay) {
        if (tracei)
          mdcache->try_reconnect_cap(tracei, session);
      } else {
        // include metadata in reply
        set_trace_dist(session, reply, tracei, tracedn,
                       snapid, dentry_wanted,
                       mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    req->get_connection()->send_message(reply);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
           << " error " << r << " (" << cpp_strerror(r)  << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}
1459
1460
1461void Server::encode_empty_dirstat(bufferlist& bl)
1462{
1463 static DirStat empty;
1464 empty.encode(bl);
1465}
1466
1467void Server::encode_infinite_lease(bufferlist& bl)
1468{
1469 LeaseStat e;
1470 e.seq = 0;
1471 e.mask = -1;
1472 e.duration_ms = -1;
1473 ::encode(e, bl);
1474 dout(20) << "encode_infinite_lease " << e << dendl;
1475}
1476
1477void Server::encode_null_lease(bufferlist& bl)
1478{
1479 LeaseStat e;
1480 e.seq = 0;
1481 e.mask = 0;
1482 e.duration_ms = 0;
1483 ::encode(e, bl);
1484 dout(20) << "encode_null_lease " << e << dendl;
1485}
1486
1487
/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 *
 * The encode order below (snap trace, then diri/dirstat/dentry/lease, then
 * target inode) is wire format the client decodes positionally -- do not
 * reorder.
 */
void Server::set_trace_dist(Session *session, MClientReply *reply,
                            CInode *in, CDentry *dn,
                            snapid_t snapid,
                            int dentry_wanted,
                            MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  client_t client = session->get_client();
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  //assert((bool)dn == (bool)dentry_wanted);  // not true for snapshot lookups

  // realm: only live (non-snapshot) lookups carry the snap trace.
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    dir->encode_dirstat(bl, whoami);
    dout(20) << "set_trace_dist added dir  " << *dir << dendl;

    ::encode(dn->get_name(), bl);
    // Snapshot dentries get a null lease; live ones get a real client lease.
    if (snapid == CEPH_NOSNAP)
      mds->locker->issue_client_lease(dn, client, bl, now, session);
    else
      encode_null_lease(bl);
    dout(20) << "set_trace_dist added dn   " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in   " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
1563
1564
1565
1566
/***
 * process a client request
 * This function DOES put the passed message before returning
 *
 * Validates the session, short-circuits already-completed (replayed or
 * retried) requests, trims the session's completed-request list, then
 * registers an MDRequest and dispatches it.
 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if (session->is_closed() ||
               session->is_closing() ||
               session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      // Still advance the replay queue so a dead session can't stall it.
      if (req->is_queued_for_replay())
        mds->queue_one_replay();
      req->put();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it?  hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      // Don't send traceless reply if the completed request has created
      // new inode. Treat the request as lookup request instead.
      if (req->is_replay() ||
          ((created == inodeno_t() || !mds->is_clientreplay()) &&
           req->get_op() != CEPH_MDS_OP_OPEN &&
           req->get_op() != CEPH_MDS_OP_CREATE)) {
        dout(5) << "already completed " << req->get_reqid() << dendl;
        MClientReply *reply = new MClientReply(req, 0);
        if (created != inodeno_t()) {
          bufferlist extra;
          ::encode(created, extra);
          reply->set_extra_bl(extra);
        }
        req->get_connection()->send_message(reply);

        if (req->is_queued_for_replay())
          mds->queue_one_replay();

        req->put();
        return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
          req->get_op() != CEPH_MDS_OP_CREATE) {
        // Re-execute as a read-only lookup/getattr so the client can pick up
        // the trace for the inode the completed request created.
        dout(10) << " completed request which created new inode " << created
                 << ", convert it to lookup request" << dendl;
        req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
        req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Sessions 'completed_requests' was dirtied, mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
          session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
        session->reset_num_trim_requests_warnings();
    } else {
      // Client is not advancing oldest_client_tid; warn with exponential
      // backoff (threshold doubles per warning already issued).
      if (session->get_num_completed_requests() >=
          (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
        session->inc_num_trim_requests_warnings();
        stringstream ss;
        ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
           << req->get_oldest_client_tid() << "), "
           << session->get_num_completed_requests()
           << " completed requests recorded in session\n";
        mds->clog->warn() << ss.str();
        dout(20) << __func__ << " " << ss.str() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
         p != req->releases.end();
         ++p)
      mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
    req->releases.clear();
  }

  dispatch_client_request(mdr);
  return;
}
1704
1705void Server::handle_osd_map()
1706{
1707 /* Note that we check the OSDMAP_FULL flag directly rather than
1708 * using osdmap_full_flag(), because we want to know "is the flag set"
1709 * rather than "does the flag apply to us?" */
1710 mds->objecter->with_osdmap([this](const OSDMap& o) {
1711 is_full = o.test_flag(CEPH_OSDMAP_FULL);
1712 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1713 << o.get_epoch() << dendl;
1714 });
1715}
1716
1717void Server::dispatch_client_request(MDRequestRef& mdr)
1718{
1719 // we shouldn't be waiting on anyone.
1720 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1721
1722 if (mdr->killed) {
1723 dout(10) << "request " << *mdr << " was killed" << dendl;
1724 return;
1725 }
1726
1727 MClientRequest *req = mdr->client_request;
1728
1729 if (logger) logger->inc(l_mdss_dispatch_client_request);
1730
1731 dout(7) << "dispatch_client_request " << *req << dendl;
1732
1733 if (req->may_write()) {
1734 if (mdcache->is_readonly()) {
1735 dout(10) << " read-only FS" << dendl;
1736 respond_to_request(mdr, -EROFS);
1737 return;
1738 }
1739 if (mdr->has_more() && mdr->more()->slave_error) {
1740 dout(10) << " got error from slaves" << dendl;
1741 respond_to_request(mdr, mdr->more()->slave_error);
1742 return;
1743 }
1744 }
1745
1746 if (is_full) {
1747 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1748 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1749 req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1750 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1751 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1752 req->get_op() == CEPH_MDS_OP_CREATE ||
1753 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1754 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1755 ((req->get_op() == CEPH_MDS_OP_LINK ||
1756 req->get_op() == CEPH_MDS_OP_RENAME) &&
1757 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1758 ) {
1759
1760 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1761 respond_to_request(mdr, -ENOSPC);
1762 return;
1763 } else {
1764 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1765 }
1766 }
1767
1768 switch (req->get_op()) {
1769 case CEPH_MDS_OP_LOOKUPHASH:
1770 case CEPH_MDS_OP_LOOKUPINO:
1771 handle_client_lookup_ino(mdr, false, false);
1772 break;
1773 case CEPH_MDS_OP_LOOKUPPARENT:
1774 handle_client_lookup_ino(mdr, true, false);
1775 break;
1776 case CEPH_MDS_OP_LOOKUPNAME:
1777 handle_client_lookup_ino(mdr, false, true);
1778 break;
1779
1780 // inodes ops.
1781 case CEPH_MDS_OP_LOOKUP:
1782 handle_client_getattr(mdr, true);
1783 break;
1784
1785 case CEPH_MDS_OP_LOOKUPSNAP:
1786 // lookupsnap does not reference a CDentry; treat it as a getattr
1787 case CEPH_MDS_OP_GETATTR:
1788 handle_client_getattr(mdr, false);
1789 break;
1790
1791 case CEPH_MDS_OP_SETATTR:
1792 handle_client_setattr(mdr);
1793 break;
1794 case CEPH_MDS_OP_SETLAYOUT:
1795 handle_client_setlayout(mdr);
1796 break;
1797 case CEPH_MDS_OP_SETDIRLAYOUT:
1798 handle_client_setdirlayout(mdr);
1799 break;
1800 case CEPH_MDS_OP_SETXATTR:
1801 handle_client_setxattr(mdr);
1802 break;
1803 case CEPH_MDS_OP_RMXATTR:
1804 handle_client_removexattr(mdr);
1805 break;
1806
1807 case CEPH_MDS_OP_READDIR:
1808 handle_client_readdir(mdr);
1809 break;
1810
1811 case CEPH_MDS_OP_SETFILELOCK:
1812 handle_client_file_setlock(mdr);
1813 break;
1814
1815 case CEPH_MDS_OP_GETFILELOCK:
1816 handle_client_file_readlock(mdr);
1817 break;
1818
1819 // funky.
1820 case CEPH_MDS_OP_CREATE:
1821 if (mdr->has_completed)
1822 handle_client_open(mdr); // already created.. just open
1823 else
1824 handle_client_openc(mdr);
1825 break;
1826
1827 case CEPH_MDS_OP_OPEN:
1828 handle_client_open(mdr);
1829 break;
1830
1831 // namespace.
1832 // no prior locks.
1833 case CEPH_MDS_OP_MKNOD:
1834 handle_client_mknod(mdr);
1835 break;
1836 case CEPH_MDS_OP_LINK:
1837 handle_client_link(mdr);
1838 break;
1839 case CEPH_MDS_OP_UNLINK:
1840 case CEPH_MDS_OP_RMDIR:
1841 handle_client_unlink(mdr);
1842 break;
1843 case CEPH_MDS_OP_RENAME:
1844 handle_client_rename(mdr);
1845 break;
1846 case CEPH_MDS_OP_MKDIR:
1847 handle_client_mkdir(mdr);
1848 break;
1849 case CEPH_MDS_OP_SYMLINK:
1850 handle_client_symlink(mdr);
1851 break;
1852
1853
1854 // snaps
1855 case CEPH_MDS_OP_LSSNAP:
1856 handle_client_lssnap(mdr);
1857 break;
1858 case CEPH_MDS_OP_MKSNAP:
1859 handle_client_mksnap(mdr);
1860 break;
1861 case CEPH_MDS_OP_RMSNAP:
1862 handle_client_rmsnap(mdr);
1863 break;
1864 case CEPH_MDS_OP_RENAMESNAP:
1865 handle_client_renamesnap(mdr);
1866 break;
1867
1868 default:
1869 dout(1) << " unknown client op " << req->get_op() << dendl;
1870 respond_to_request(mdr, -EOPNOTSUPP);
1871 }
1872}
1873
1874
1875// ---------------------------------------
1876// SLAVE REQUESTS
1877
1878/* This function DOES put the passed message before returning*/
1879void Server::handle_slave_request(MMDSSlaveRequest *m)
1880{
1881 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1882 mds_rank_t from = mds_rank_t(m->get_source().num());
1883
1884 if (logger) logger->inc(l_mdss_handle_slave_request);
1885
1886 // reply?
1887 if (m->is_reply())
1888 return handle_slave_request_reply(m);
1889
1890 // the purpose of rename notify is enforcing causal message ordering. making sure
1891 // bystanders have received all messages from rename srcdn's auth MDS.
1892 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1893 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1894 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1895 mds->send_message(reply, m->get_connection());
1896 m->put();
1897 return;
1898 }
1899
1900 CDentry *straydn = NULL;
1901 if (m->stray.length() > 0) {
1902 straydn = mdcache->add_replica_stray(m->stray, from);
1903 assert(straydn);
1904 m->stray.clear();
1905 }
1906
1907 // am i a new slave?
1908 MDRequestRef mdr;
1909 if (mdcache->have_request(m->get_reqid())) {
1910 // existing?
1911 mdr = mdcache->request_get(m->get_reqid());
1912
1913 // is my request newer?
1914 if (mdr->attempt > m->get_attempt()) {
1915 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
1916 << ", dropping " << *m << dendl;
1917 m->put();
1918 return;
1919 }
1920
1921
1922 if (mdr->attempt < m->get_attempt()) {
1923 // mine is old, close it out
1924 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
1925 << ", closing out" << dendl;
1926 mdcache->request_finish(mdr);
1927 mdr.reset();
1928 } else if (mdr->slave_to_mds != from) {
1929 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
1930 m->put();
1931 return;
1932 }
1933
1934 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
1935 mdr->aborted = true;
1936 if (mdr->slave_request) {
1937 // only abort on-going xlock, wrlock and auth pin
1938 assert(!mdr->slave_did_prepare());
1939 } else {
1940 mdcache->request_finish(mdr);
1941 }
1942 return;
1943 }
1944 }
1945 if (!mdr.get()) {
1946 // new?
1947 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
1948 dout(10) << "missing slave request for " << m->get_reqid()
1949 << " OP_FINISH, must have lost race with a forward" << dendl;
1950 m->put();
1951 return;
1952 }
1953 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
1954 mdr->set_op_stamp(m->op_stamp);
1955 }
1956 assert(mdr->slave_request == 0); // only one at a time, please!
1957
1958 if (straydn) {
1959 mdr->pin(straydn);
1960 mdr->straydn = straydn;
1961 }
1962
1963 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
1964 dout(3) << "not clientreplay|active yet, waiting" << dendl;
1965 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
1966 return;
1967 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
1968 mdr->locks.empty()) {
1969 dout(3) << "not active yet, waiting" << dendl;
1970 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
1971 return;
1972 }
1973
1974 mdr->slave_request = m;
1975
1976 dispatch_slave_request(mdr);
1977}
1978
/* This function DOES put the passed message before returning*/
/*
 * Process a reply from a slave MDS to one of our (master-side) slave
 * requests: lock acks, authpin acks, prep acks, commit acks.
 */
void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // Not ready yet: drop replies for masters we no longer track as
  // uncommitted; otherwise park the message until replay completes.
  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_master(r, from)) {
      dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
	       << from << " reqid " << r << dendl;
      m->put();
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // slave has made the update durable; note it and we're done
  if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_master_slave(r, from);
    m->put();
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  // stale reply from a previous attempt of this request; ignore
  if (m->get_attempt() != mdr->attempt) {
    dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case MMDSSlaveRequest::OP_XLOCKACK:
    {
      // identify lock, master request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->slaves.insert(from);
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      mdr->xlocks.insert(lock);
      mdr->locks.insert(lock);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      // remote locks are taken one at a time, so the wait set must drain to empty
      assert(mdr->more()->waiting_on_slave.count(from));
      mdr->more()->waiting_on_slave.erase(from);
      assert(mdr->more()->waiting_on_slave.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSSlaveRequest::OP_WRLOCKACK:
    {
      // identify lock, master request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->slaves.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      mdr->remote_wrlocks[lock] = from;
      mdr->locks.insert(lock);
      mdr->finish_locking(lock);

      assert(mdr->more()->waiting_on_slave.count(from));
      mdr->more()->waiting_on_slave.erase(from);
      assert(mdr->more()->waiting_on_slave.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSSlaveRequest::OP_AUTHPINACK:
    handle_slave_auth_pin_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_LINKPREPACK:
    handle_slave_link_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREPACK:
    handle_slave_rmdir_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREPACK:
    handle_slave_rename_prep_ack(mdr, m);
    break;

  case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
    handle_slave_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort();
  }

  // done with reply.
  m->put();
}
2077
/* This function DOES put the mdr->slave_request before returning*/
/*
 * Execute the operation carried by mdr->slave_request on this (slave) MDS:
 * take/release remote locks, auth pin, run *prep handlers, or finish.
 */
void Server::dispatch_slave_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;

  // master asked us to abort (set in handle_slave_request)
  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_slave_request);

  int op = mdr->slave_request->get_op();
  switch (op) {
  case MMDSSlaveRequest::OP_XLOCK:
  case MMDSSlaveRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen, if we auth pinned properly.
      }
      if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	// start from the locks we already hold so they are retained.
	set<SimpleLock*> rdlocks;
	set<SimpleLock*> wrlocks = mdr->wrlocks;
	set<SimpleLock*> xlocks = mdr->xlocks;

	int replycode = 0;
	switch (op) {
	case MMDSSlaveRequest::OP_XLOCK:
	  xlocks.insert(lock);
	  replycode = MMDSSlaveRequest::OP_XLOCKACK;
	  break;
	case MMDSSlaveRequest::OP_WRLOCK:
	  wrlocks.insert(lock);
	  replycode = MMDSSlaveRequest::OP_WRLOCKACK;
	  break;
	}

	// returns false if it queued a retry; slave_request stays pending
	if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
	  return;

	// ack
	MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	mds->send_message(r, mdr->slave_request->get_connection());
      }

      // done.
      mdr->slave_request->put();
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_UNXLOCK:
  case MMDSSlaveRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());
      assert(lock);
      bool need_issue = false;
      switch (op) {
      case MMDSSlaveRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
	break;
      case MMDSSlaveRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
	break;
      }
      // releasing the lock may allow caps to be issued again
      if (need_issue)
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done. no ack necessary.
      mdr->slave_request->put();
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_DROPLOCKS:
    mds->locker->drop_locks(mdr.get());
    mdr->slave_request->put();
    mdr->slave_request = 0;
    break;

  case MMDSSlaveRequest::OP_AUTHPIN:
    handle_slave_auth_pin(mdr);
    break;

  case MMDSSlaveRequest::OP_LINKPREP:
  case MMDSSlaveRequest::OP_UNLINKPREP:
    handle_slave_link_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RMDIRPREP:
    handle_slave_rmdir_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREP:
    handle_slave_rename_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_FINISH:
    // information about rename imported caps
    if (mdr->slave_request->inode_export.length() > 0)
      mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
    // finish off request.
    mdcache->request_finish(mdr);
    break;

  default:
    ceph_abort();
  }
}
2200
/* This function DOES put the mdr->slave_request before returning*/
/*
 * Auth-pin (on behalf of a master MDS) the objects listed in the slave
 * request, optionally freezing one inode for rename, and ack with the
 * set of pins actually taken (or an error flag).
 */
void Server::handle_slave_auth_pin(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_auth_pin " << *mdr << dendl;

  // build list of objects
  list<MDSCacheObject*> objects;
  CInode *auth_pin_freeze = NULL;
  bool fail = false, wouldblock = false, readonly = false;

  if (mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;
    readonly = true;
    fail = true;
  }

  // resolve each requested object in our cache; missing object => fail
  if (!fail) {
    for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
	 p != mdr->slave_request->get_authpins().end();
	 ++p) {
      MDSCacheObject *object = mdcache->get_object(*p);
      if (!object) {
	dout(10) << " don't have " << *p << dendl;
	fail = true;
	break;
      }

      objects.push_back(object);
      if (*p == mdr->slave_request->get_authpin_freeze())
	auth_pin_freeze = static_cast<CInode*>(object);
    }
  }

  // can we auth pin them?
  if (!fail) {
    for (list<MDSCacheObject*>::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      if (!(*p)->is_auth()) {
	dout(10) << " not auth for " << **p << dendl;
	fail = true;
	break;
      }
      if (mdr->is_auth_pinned(*p))
	continue;
      if (!mdr->can_auth_pin(*p)) {
	if (mdr->slave_request->is_nonblock()) {
	  // master asked us not to block; report EWOULDBLOCK instead
	  dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
	  fail = true;
	  wouldblock = true;
	  break;
	}
	// wait
	dout(10) << " waiting for authpinnable on " << **p << dendl;
	(*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
	mdr->drop_local_auth_pins();

	// we dropped our pins; let the freezer know so it can make progress
	mds->locker->notify_freeze_waiter(*p);
	return;
      }
    }
  }

  // auth pin!
  if (fail) {
    mdr->drop_local_auth_pins(); // just in case
  } else {
    /* freeze authpin wrong inode */
    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
	mdr->more()->rename_inode != auth_pin_freeze)
      mdr->unfreeze_auth_pin(true);

    /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations
     * on the source inode to complete. This happens after all locks for the rename
     * operation are acquired. But to acquire locks, we need auth pin locks' parent
     * objects first. So there is an ABBA deadlock if someone auth pins the source inode
     * after locks are acquired and before Server::handle_slave_rename_prep() is called.
     * The solution is freeze the inode and prevent other MDRequests from getting new
     * auth pins.
     */
    if (auth_pin_freeze) {
      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
	// not frozen yet; retry when WAIT_FROZEN fires
	auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
	mds->mdlog->flush();
	return;
      }
    }
    for (list<MDSCacheObject*>::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      dout(10) << "auth_pinning " << **p << dendl;
      mdr->auth_pin(*p);
    }
  }

  // ack!
  MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);

  // return list of my auth_pins (if any)
  for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
       p != mdr->auth_pins.end();
       ++p) {
    MDSCacheObjectInfo info;
    (*p)->set_object_info(info);
    reply->get_authpins().push_back(info);
    if (*p == (MDSCacheObject*)auth_pin_freeze)
      auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
  }

  if (wouldblock)
    reply->mark_error_wouldblock();
  if (readonly)
    reply->mark_error_rofs();

  mds->send_message_mds(reply, mdr->slave_to_mds);

  // clean up this request
  mdr->slave_request->put();
  mdr->slave_request = 0;
  return;
}
2323
/* This function DOES NOT put the passed ack before returning*/
/*
 * Master-side handler for an AUTHPINACK: reconcile our record of which
 * objects the slave has pinned (added, frozen, or released), note any
 * error flags, and re-dispatch once all slaves have answered.
 */
void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // added auth pins?
  set<MDSCacheObject*> pinned;
  for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
       p != ack->get_authpins().end();
       ++p) {
    MDSCacheObject *object = mdcache->get_object(*p);
    assert(object); // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    if (!mdr->is_auth_pinned(object))
      mdr->remote_auth_pins[object] = from;
    if (*p == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
    pinned.insert(object);
  }

  // removed frozen auth pin ?
  // (an empty authpin_freeze field from the freezing slave means it let go)
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
    assert(p != mdr->remote_auth_pins.end());
    if (p->second == from) {
      mdr->more()->is_remote_frozen_authpin = false;
    }
  }

  // removed auth pins?
  // anything we recorded for this slave that it did not re-report is gone
  map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
  while (p != mdr->remote_auth_pins.end()) {
    MDSCacheObject* object = p->first;
    if (p->second == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->remote_auth_pins.erase(p++);
    } else {
      ++p;
    }
  }

  if (ack->is_error_rofs()) {
    mdr->more()->slave_error = -EROFS;
    mdr->aborted = true;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->slave_error = -EWOULDBLOCK;
    mdr->aborted = true;
  }

  // note slave
  mdr->more()->slaves.insert(from);

  // clear from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // go again?
  if (mdr->more()->waiting_on_slave.empty())
    mdcache->dispatch_request(mdr);
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
2388
2389
2390// ---------------------------------------
2391// HELPERS
2392
2393
2394/**
2395 * check whether we are permitted to complete a request
2396 *
2397 * Check whether we have permission to perform the operation specified
2398 * by mask on the given inode, based on the capability in the mdr's
2399 * session.
2400 */
2401bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2402{
2403 if (mdr->session) {
2404 int r = mdr->session->check_access(
2405 in, mask,
2406 mdr->client_request->get_caller_uid(),
2407 mdr->client_request->get_caller_gid(),
2408 &mdr->client_request->get_caller_gid_list(),
2409 mdr->client_request->head.args.setattr.uid,
2410 mdr->client_request->head.args.setattr.gid);
2411 if (r < 0) {
2412 respond_to_request(mdr, r);
2413 return false;
2414 }
2415 }
2416 return true;
2417}
2418
2419/**
2420 * check whether fragment has reached maximum size
2421 *
2422 */
2423bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2424{
2425 const auto size = in->get_frag_size();
2426 if (size >= g_conf->mds_bal_fragment_size_max) {
2427 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2428 respond_to_request(mdr, -ENOSPC);
2429 return false;
2430 }
2431
2432 return true;
2433}
2434
2435
2436/** validate_dentry_dir
2437 *
2438 * verify that the dir exists and would own the dname.
2439 * do not check if the dentry exists.
2440 */
2441CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname)
2442{
2443 // make sure parent is a dir?
2444 if (!diri->is_dir()) {
2445 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2446 respond_to_request(mdr, -ENOTDIR);
2447 return NULL;
2448 }
2449
2450 // which dirfrag?
2451 frag_t fg = diri->pick_dirfrag(dname);
2452 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2453 if (!dir)
2454 return 0;
2455
2456 // frozen?
2457 if (dir->is_frozen()) {
2458 dout(7) << "dir is frozen " << *dir << dendl;
2459 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2460 return NULL;
2461 }
2462
2463 return dir;
2464}
2465
2466
/** prepare_null_dentry
 * prepare a null (or existing) dentry in given dir.
 * wait for any dn lock.
 *
 * Returns NULL if the caller must wait (retry queued) or the request
 * was answered with -EEXIST; otherwise the (possibly new) dentry.
 */
CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist)
{
  dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
  assert(dir->is_auth());

  client_t client = mdr->get_client();

  // does it already exist?
  CDentry *dn = dir->lookup(dname);
  if (dn) {
    /*
    if (dn->lock.is_xlocked_by_other(mdr)) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    */
    if (!dn->get_linkage(client, mdr)->is_null()) {
      // name already exists
      dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
      if (!okexist) {
	respond_to_request(mdr, -EEXIST);
	return 0;
      }
    } else {
      // reuse the null dentry, but start it after the latest snapshot
      dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
    }

    return dn;
  }

  // make sure dir is complete
  // (bloom filter lets us skip the fetch when dname is definitely absent)
  if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
    dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  // create
  dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
  dn->mark_new();
  dout(10) << "prepare_null_dentry added " << *dn << dendl;
  return dn;
}
2515
// Obtain (and pin) the stray dentry that will hold `in` when it is
// unlinked.  Reuses an already-prepared stray dentry when its name still
// matches; returns NULL if the stray dir has no space (request answered).
CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
{
  CDentry *straydn = mdr->straydn;
  if (straydn) {
    string straydname;
    in->name_stray_dentry(straydname);
    if (straydn->get_name() == straydname)
      return straydn;

    // stale stray dentry; must be replaced before locks are finalized
    assert(!mdr->done_locking);
    mdr->unpin(straydn);
  }

  CDir *straydir = mdcache->get_stray_dir(in);

  // replayed requests must proceed regardless of fragment fullness
  if (!mdr->client_request->is_replay() &&
      !check_fragment_space(mdr, straydir))
    return NULL;

  straydn = mdcache->get_or_create_stray_dentry(in);
  mdr->straydn = straydn;
  mdr->pin(straydn);
  return straydn;
}
2540
/** prepare_new_inode
 *
 * create a new inode. set c/m/atime. hit dir pop.
 *
 * Allocates an ino (preferring the session's preallocated pool), refills
 * the pool when low, initializes mode/uid/gid/layout/times, applies any
 * client-supplied xattrs, and registers the inode in the cache.
 */
CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
				  file_layout_t *layout)
{
  CInode *in = new CInode(mdcache);

  // Server::prepare_force_open_sessions() can re-open session in closing
  // state. In that corner case, session's prealloc_inos are being freed.
  // To simplify the code, we disallow using/refilling session's prealloc_ino
  // while session is opening.
  bool allow_prealloc_inos = !mdr->session->is_opening();

  // assign ino
  if (allow_prealloc_inos &&
      mdr->session->info.prealloc_inos.size()) {
    mdr->used_prealloc_ino =
      in->inode.ino = mdr->session->take_ino(useino);  // prealloc -> used
    mds->sessionmap.mark_projected(mdr->session);

    dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
	     << " (" << mdr->session->info.prealloc_inos
	     << ", " << mdr->session->info.prealloc_inos.size() << " left)"
	     << dendl;
  } else {
    // no preallocated inos available; project a fresh one from the inotable
    mdr->alloc_ino =
      in->inode.ino = mds->inotable->project_alloc_id();
    dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
  }

  // the client's requested ino is advisory; warn if we couldn't honor it
  if (useino && useino != in->inode.ino) {
    dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
    mds->clog->error() << mdr->client_request->get_source()
       << " specified ino " << useino
       << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
    //ceph_abort(); // just for now.
  }

  // refill the session's prealloc pool once it drops below half
  if (allow_prealloc_inos &&
      mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
    int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
    assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
  }

  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.nlink = 1;   // FIXME

  in->inode.mode = mode;

  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
  } else if (layout) {
    in->inode.layout = *layout;
  } else {
    in->inode.layout = mdcache->default_file_layout;
  }

  in->inode.truncate_size = -1ull;  // not truncated, yet!
  in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */

  CInode *diri = dir->get_inode();

  dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;

  // NB: the "sticky" wording in the logs below actually refers to the
  // setgid bit (S_ISGID): the group is inherited, and subdirectories
  // keep the setgid bit.
  if (diri->inode.mode & S_ISGID) {
    dout(10) << " dir is sticky" << dendl;
    in->inode.gid = diri->inode.gid;
    if (S_ISDIR(mode)) {
      dout(10) << " new dir also sticky" << dendl;
      in->inode.mode |= S_ISGID;
    }
  } else
    in->inode.gid = mdr->client_request->get_caller_gid();

  in->inode.uid = mdr->client_request->get_caller_uid();

  in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
    mdr->get_op_stamp();

  in->inode.change_attr = 0;

  // apply any xattrs the client attached to the create request
  MClientRequest *req = mdr->client_request;
  if (req->get_data().length()) {
    bufferlist::iterator p = req->get_data().begin();

    // xattrs on new inode?
    map<string,bufferptr> xattrs;
    ::decode(xattrs, p);
    for (map<string,bufferptr>::iterator p = xattrs.begin(); p != xattrs.end(); ++p) {
      dout(10) << "prepare_new_inode setting xattr " << p->first << dendl;
      in->xattrs[p->first] = p->second;
    }
  }

  // disable inline data unless both cluster and client support it
  if (!mds->mdsmap->get_inline_data_enabled() ||
      !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
    in->inode.inline_data.version = CEPH_INLINE_NONE;

  mdcache->add_inode(in);  // add
  dout(10) << "prepare_new_inode " << *in << dendl;
  return in;
}
2651
// Record this request's ino allocations (and the projected sessionmap /
// inotable versions they were made against) in the journal metablob, so
// replay can redo the allocation consistently.
void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
{
  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
	   << " inotablev " << mds->inotable->get_projected_version()
	   << dendl;
  blob->set_ino_alloc(mdr->alloc_ino,
		      mdr->used_prealloc_ino,
		      mdr->prealloc_inos,
		      mdr->client_request->get_source(),
		      mds->sessionmap.get_projected(),
		      mds->inotable->get_projected_version());
}
2664
// Commit the ino allocations projected in prepare_new_inode: apply them
// to the inotable and move preallocated inos from the session's pending
// set into its usable pool.
void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
{
  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
	   << " / " << mdr->prealloc_inos
	   << " / " << mdr->used_prealloc_ino << dendl;

  // freshly allocated ino (non-prealloc path)
  if (mdr->alloc_ino) {
    mds->inotable->apply_alloc_id(mdr->alloc_ino);
  }
  // newly preallocated pool refill: pending -> usable
  if (mdr->prealloc_inos.size()) {
    assert(session);
    session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
    session->info.prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_dirty(session);
    mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
  }
  // a prealloc ino was consumed; drop it from the in-flight used set
  if (mdr->used_prealloc_ino) {
    assert(session);
    session->info.used_inos.erase(mdr->used_prealloc_ino);
    mds->sessionmap.mark_dirty(session);
  }
}
2687
// Retry context used after a path traversal returns -ESTALE: we ask peer
// MDSs to locate the inode (find_ino_peers) and, when that completes,
// either retry the request or fail it with -ESTALE.
class C_MDS_TryFindInode : public ServerContext {
  MDRequestRef mdr;
public:
  C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    if (r == -ESTALE) // :( find_ino_peers failed
      server->respond_to_request(mdr, r);
    else
      server->dispatch_client_request(mdr);
  }
};
2699
// Traverse to the parent directory of refpath's final component and
// return the auth dirfrag that would own that dentry.  Returns NULL when
// the request was delayed, forwarded, or answered with an error; `trace`
// receives the dentries walked to reach the parent.
CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
{
  // figure parent dir vs dname
  if (refpath.depth() == 0) {
    dout(7) << "can't do that to root" << dendl;
    respond_to_request(mdr, -EINVAL);
    return 0;
  }
  string dname = refpath.last_dentry();
  refpath.pop_dentry();

  dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;

  // traverse to parent dir
  CInode *diri;
  int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
  if (r > 0) return 0; // delayed
  if (r < 0) {
    if (r == -ESTALE) {
      // inode not in our cache; ask peers to find it, then retry
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
      return 0;
    }
    respond_to_request(mdr, r);
    return 0;
  }

  // is it an auth dir?
  CDir *dir = validate_dentry_dir(mdr, diri, dname);
  if (!dir)
    return 0; // forwarded or waiting for freeze

  dout(10) << "traverse_to_auth_dir " << *dir << dendl;
  return dir;
}
2735
/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to */
/*
 * Traverse the request's path (filepath or filepath2 per n), pin the
 * target inode, and collect the rdlocks needed along the trace.
 * If want_auth (or the target is snapped), forward to / wait for the
 * inode's auth and auth-pin it first.
 */
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
				    set<SimpleLock*> &rdlocks,
				    bool want_auth,
				    bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
							  a snapped dir */
				    file_layout_t **layout,
				    bool no_lookup)  // true if we cannot return a null dentry lease
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  // locks already settled on an earlier pass; target is cached on mdr
  if (mdr->done_locking)
    return mdr->in[n];

  // traverse
  int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
  if (r > 0)
    return NULL; // delayed
  if (r < 0) {  // error
    if (r == -ENOENT && n == 0 && mdr->dn[n].size()) {
      // expose the last dentry so the client can get a null dentry lease
      if (!no_lookup)
	mdr->tracedn = mdr->dn[n][mdr->dn[n].size()-1];
      respond_to_request(mdr, r);
    } else if (r == -ESTALE) {
      dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
      MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
      mdcache->find_ino_peers(refpath.get_ino(), c);
    } else {
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);
    }
    return 0;
  }
  CInode *ref = mdr->in[n];
  dout(10) << "ref is " << *ref << dendl;

  // fw to inode auth?
  if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
    want_auth = true;

  if (want_auth) {
    if (ref->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }
    if (!ref->is_auth()) {
      dout(10) << "fw to auth for " << *ref << dendl;
      mdcache->request_forward(mdr, ref->authority().first);
      return 0;
    }

    // auth_pin?
    //   do NOT proceed if freezing, as cap release may defer in that case, and
    //   we could deadlock when we try to lock @ref.
    // if we're already auth_pinned, continue; the release has already been processed.
    if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
	(ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      /* If we have any auth pins, this will deadlock.
       * But the only way to get here if we've already got auth pins
       * is because we're on an inode with snapshots that got updated
       * between dispatches of this request.  So we're going to drop
       * our locks and our auth pins and reacquire them later.
       *
       * This is safe since we're only in this function when working on
       * a single MDS request; otherwise we'd be in
       * rdlock_path_xlock_dentry.
       */
      mds->locker->drop_locks(mdr.get(), NULL);
      mdr->drop_local_auth_pins();
      // if a remote MDS still pins us, tell the freezer so it can proceed
      if (!mdr->remote_auth_pins.empty())
	mds->locker->notify_freeze_waiter(ref);
      return 0;
    }

    mdr->auth_pin(ref);
  }

  // rdlock every dentry along the trace, plus the snap locks on the target
  for (int i=0; i<(int)mdr->dn[n].size(); i++)
    rdlocks.insert(&mdr->dn[n][i]->lock);
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
  else
    mds->locker->include_snap_rdlocks(rdlocks, ref);

  // set and pin ref
  mdr->pin(ref);
  return ref;
}
2829
2830
/** rdlock_path_xlock_dentry
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry, forward as necessary.
 * create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 *
 * Returns NULL when the request was delayed, forwarded, or answered
 * with an error; otherwise the (possibly new) target dentry, with the
 * required lock sets populated for the caller to acquire.
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
					  set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
					  bool okexist, bool mustexist, bool alwaysxlock,
					  file_layout_t **layout)
{
  const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();

  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  client_t client = mdr->get_client();

  // locks already settled on an earlier pass; dentry is cached on mdr
  if (mdr->done_locking)
    return mdr->dn[n].back();

  CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
  if (!dir) return 0;

  CInode *diri = dir->get_inode();
  // clients may not modify system dirs (mds-originated requests may)
  if (!mdr->reqid.name.is_mds()) {
    if (diri->is_system() && !diri->is_root()) {
      respond_to_request(mdr, -EROFS);
      return 0;
    }
  }
  // parent already unlinked (in a stray dir) => path no longer exists
  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -ENOENT);
    return 0;
  }

  // make a null dentry?
  const string &dname = refpath.last_dentry();
  CDentry *dn;
  if (mustexist) {
    dn = dir->lookup(dname);

    // make sure dir is complete
    if (!dn && !dir->is_complete() &&
	(!dir->has_bloom() || dir->is_in_bloom(dname))) {
      dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // readable?
    if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // exists?
    if (!dn || dn->get_linkage(client, mdr)->is_null()) {
      dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
      respond_to_request(mdr, -ENOENT);
      return 0;
    }
  } else {
    dn = prepare_null_dentry(mdr, dir, dname, okexist);
    if (!dn)
      return 0;
  }

  mdr->dn[n].push_back(dn);
  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  mdr->in[n] = dnl->get_inode();

  // -- lock --
  // NOTE: rename takes the same set of locks for srcdn
  for (int i=0; i<(int)mdr->dn[n].size(); i++)
    rdlocks.insert(&mdr->dn[n][i]->lock);
  if (alwaysxlock || dnl->is_null())
    xlocks.insert(&dn->lock);                 // new dn, xlock
  else
    rdlocks.insert(&dn->lock);                // existing dn, rdlock
  wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
  wrlocks.insert(&dn->get_dir()->inode->nestlock); // also, wrlock on dir mtime
  if (layout)
    mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
  else
    mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);

  return dn;
}
2920
2921
2922
2923
2924
2925/**
2926 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
2927 *
2928 * @param diri base inode
2929 * @param fg the exact frag we want
2930 * @param mdr request
2931 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
2932 */
2933CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
2934{
2935 CDir *dir = diri->get_dirfrag(fg);
2936
2937 // not open and inode not mine?
2938 if (!dir && !diri->is_auth()) {
2939 mds_rank_t inauth = diri->authority().first;
2940 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
2941 mdcache->request_forward(mdr, inauth);
2942 return 0;
2943 }
2944
2945 // not open and inode frozen?
2946 if (!dir && diri->is_frozen()) {
2947 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
2948 assert(diri->get_parent_dir());
224ce89b 2949 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae
FG
2950 return 0;
2951 }
2952
2953 // invent?
2954 if (!dir)
2955 dir = diri->get_or_open_dirfrag(mdcache, fg);
2956
2957 // am i auth for the dirfrag?
2958 if (!dir->is_auth()) {
2959 mds_rank_t auth = dir->authority().first;
2960 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
2961 << ", fw to mds." << auth << dendl;
2962 mdcache->request_forward(mdr, auth);
2963 return 0;
2964 }
2965
2966 return dir;
2967}
2968
2969
2970// ===============================================================================
2971// STAT
2972
/**
 * Handle a client getattr/lookup request: rdlock exactly the metadata
 * fields named by the getattr mask (skipping fields the client already
 * holds EXCL caps on), then reply with a trace to the inode.
 */
void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  if (req->get_filepath().depth() == 0 && is_lookup) {
    // refpath can't be empty for lookup but it can for
    // getattr (we do getattr with empty refpath for mount of '/')
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, false, false, NULL, !is_lookup);
  if (!ref) return;

  /*
   * if client currently holds the EXCL cap on a field, do not rdlock
   * it; client's stat() will result in valid info if _either_ EXCL
   * cap is held or MDS rdlocks and reads the value here.
   *
   * handling this case here is easier than weakening rdlock
   * semantics... that would cause problems elsewhere.
   */
  client_t client = mdr->get_client();
  int issued = 0;
  Capability *cap = ref->get_client_cap(client);
  // only trust issued caps for live inodes or snapshots the cap still covers
  if (cap && (mdr->snapid == CEPH_NOSNAP ||
	      mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  // rdlock each SHARED field the client asked for unless it holds the
  // corresponding EXCL cap (in which case its own copy is authoritative)
  int mask = req->head.args.getattr.mask;
  if ((mask & CEPH_CAP_LINK_SHARED) && (issued & CEPH_CAP_LINK_EXCL) == 0) rdlocks.insert(&ref->linklock);
  if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0) rdlocks.insert(&ref->authlock);
  if ((mask & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0) rdlocks.insert(&ref->filelock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0) rdlocks.insert(&ref->xattrlock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, ref, MAY_READ))
    return;

  // note which caps are requested, so we return at least a snapshot
  // value for them. (currently this matters for xattrs and inline data)
  mdr->getattr_caps = mask;

  mds->balancer->hit_inode(ceph_clock_now(), ref, META_POP_IRD,
			   req->get_source().num());

  // reply
  dout(10) << "reply to stat on " << *req << dendl;
  mdr->tracei = ref;
  if (is_lookup)
    mdr->tracedn = mdr->dn[0].back();
  respond_to_request(mdr, 0);
}
3029
3030struct C_MDS_LookupIno2 : public ServerContext {
3031 MDRequestRef mdr;
3032 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3033 void finish(int r) override {
3034 server->_lookup_ino_2(mdr, r);
3035 }
3036};
3037
/* This function DOES clean up the mdr before returning*/
/*
 * filepath: ino
 *
 * Resolve an inode number to an inode (lookup_ino), its parent
 * directory inode (want_parent / lookup_parent), or its dentry
 * (want_dentry / lookup_name).  If the inode isn't cached, kick off
 * open_ino and finish later via C_MDS_LookupIno2.
 */
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
				      bool want_parent, bool want_dentry)
{
  MClientRequest *req = mdr->client_request;

  inodeno_t ino = req->get_filepath().get_ino();
  CInode *in = mdcache->get_inode(ino);
  // an inode being purged is as good as gone
  if (in && in->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!in) {
    // not cached; discover which rank has it (or an error), then retry
    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
    return;
  }

  // make sure the snaprealm parents are open before replying
  if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
      !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
    return;
  }

  // check for nothing (not read or write); this still applies the
  // path check.
  if (!check_access(mdr, in, 0))
    return;

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ? dn->get_dir()->inode : NULL;

  set<SimpleLock*> rdlocks;
  if (dn && (want_parent || want_dentry)) {
    mdr->pin(dn);
    rdlocks.insert(&dn->lock);
  }

  unsigned mask = req->head.args.getattr.mask;
  if (mask) {
    Capability *cap = in->get_client_cap(mdr->get_client());
    int issued = 0;
    // as in getattr: skip rdlocks for fields the client holds EXCL caps on
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      rdlocks.insert(&in->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      rdlocks.insert(&in->xattrlock);

    mdr->getattr_caps = mask;
  }

  if (!rdlocks.empty()) {
    set<SimpleLock*> wrlocks, xlocks;
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    if (diri != NULL) {
      // need read access to directory inode
      if (!check_access(mdr, diri, MAY_READ))
	return;
    }
  }

  if (want_parent) {
    if (in->is_base()) {
      // root/base inodes have no parent to report
      respond_to_request(mdr, -EINVAL);
      return;
    }
    if (!diri || diri->is_stray()) {
      respond_to_request(mdr, -ESTALE);
      return;
    }
    dout(10) << "reply to lookup_parent " << *in << dendl;
    mdr->tracei = diri;
    respond_to_request(mdr, 0);
  } else {
    if (want_dentry) {
      // if the caller named a parent dir (filepath2), it must match
      inodeno_t dirino = req->get_filepath2().get_ino();
      if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
	respond_to_request(mdr, -ENOENT);
	return;
      }
      dout(10) << "reply to lookup_name " << *in << dendl;
    } else
      dout(10) << "reply to lookup_ino " << *in << dendl;

    mdr->tracei = in;
    if (want_dentry)
      mdr->tracedn = dn;
    respond_to_request(mdr, 0);
  }
}
3133
3134void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3135{
3136 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3137 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3138
3139 // `r` is a rank if >=0, else an error code
3140 if (r >= 0) {
3141 mds_rank_t dest_rank(r);
3142 if (dest_rank == mds->get_nodeid())
3143 dispatch_client_request(mdr);
3144 else
3145 mdcache->request_forward(mdr, dest_rank);
3146 return;
3147 }
3148
3149 // give up
3150 if (r == -ENOENT || r == -ENODATA)
3151 r = -ESTALE;
3152 respond_to_request(mdr, r);
3153}
3154
3155
3156/* This function takes responsibility for the passed mdr*/
3157void Server::handle_client_open(MDRequestRef& mdr)
3158{
3159 MClientRequest *req = mdr->client_request;
3160 dout(7) << "open on " << req->get_filepath() << dendl;
3161
3162 int flags = req->head.args.open.flags;
3163 int cmode = ceph_flags_to_mode(flags);
3164 if (cmode < 0) {
3165 respond_to_request(mdr, -EINVAL);
3166 return;
3167 }
3168
3169 bool need_auth = !file_mode_is_readonly(cmode) || (flags & CEPH_O_TRUNC);
3170
3171 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3172 dout(7) << "read-only FS" << dendl;
3173 respond_to_request(mdr, -EROFS);
3174 return;
3175 }
3176
3177 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3178 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3179 if (!cur)
3180 return;
3181
3182 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3183 assert(!need_auth);
3184 mdr->done_locking = false;
3185 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3186 if (!cur)
3187 return;
3188 }
3189
3190 if (!cur->inode.is_file()) {
3191 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3192 cmode = CEPH_FILE_MODE_PIN;
3193 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
3194 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3195 flags &= ~CEPH_O_TRUNC;
3196 }
3197
3198 dout(10) << "open flags = " << flags
3199 << ", filemode = " << cmode
3200 << ", need_auth = " << need_auth
3201 << dendl;
3202
3203 // regular file?
3204 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3205 dout(7) << "not a file or dir " << *cur << dendl;
3206 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3207 return;
3208 }*/
3209 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3210 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3211 respond_to_request(mdr, -EINVAL);
3212 return;
3213 }
3214
3215 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3216 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3217 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3218 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3219 return;
3220 }
3221
3222 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3223 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3224 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3225 respond_to_request(mdr, -EPERM);
3226 return;
3227 }
3228
3229 // snapped data is read only
3230 if (mdr->snapid != CEPH_NOSNAP &&
3231 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3232 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3233 respond_to_request(mdr, -EROFS);
3234 return;
3235 }
3236
3237 unsigned mask = req->head.args.open.mask;
3238 if (mask) {
3239 Capability *cap = cur->get_client_cap(mdr->get_client());
3240 int issued = 0;
3241 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3242 issued = cap->issued();
3243 // permission bits, ACL/security xattrs
3244 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3245 rdlocks.insert(&cur->authlock);
3246 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3247 rdlocks.insert(&cur->xattrlock);
3248
3249 mdr->getattr_caps = mask;
3250 }
3251
3252 // O_TRUNC
3253 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3254 assert(cur->is_auth());
3255
3256 xlocks.insert(&cur->filelock);
3257 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3258 return;
3259
3260 if (!check_access(mdr, cur, MAY_WRITE))
3261 return;
3262
3263 // wait for pending truncate?
3264 const inode_t *pi = cur->get_projected_inode();
3265 if (pi->is_truncating()) {
3266 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3267 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3268 mds->locker->drop_locks(mdr.get());
3269 mdr->drop_local_auth_pins();
3270 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3271 return;
3272 }
3273
3274 do_open_truncate(mdr, cmode);
3275 return;
3276 }
3277
3278 // sync filelock if snapped.
3279 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3280 // and that data itself is flushed so that we can read the snapped data off disk.
3281 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3282 rdlocks.insert(&cur->filelock);
3283 }
3284
3285 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3286 return;
3287
3288 mask = MAY_READ;
3289 if (cmode & CEPH_FILE_MODE_WR)
3290 mask |= MAY_WRITE;
3291 if (!check_access(mdr, cur, mask))
3292 return;
3293
3294 if (cur->is_file() || cur->is_dir()) {
3295 if (mdr->snapid == CEPH_NOSNAP) {
3296 // register new cap
3297 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3298 if (cap)
3299 dout(12) << "open issued caps " << ccap_string(cap->pending())
3300 << " for " << req->get_source()
3301 << " on " << *cur << dendl;
3302 } else {
3303 int caps = ceph_caps_for_mode(cmode);
3304 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3305 << " for " << req->get_source()
3306 << " snapid " << mdr->snapid
3307 << " on " << *cur << dendl;
3308 mdr->snap_caps = caps;
3309 }
3310 }
3311
3312 // increase max_size?
3313 if (cmode & CEPH_FILE_MODE_WR)
3314 mds->locker->check_inode_max_size(cur);
3315
3316 // make sure this inode gets into the journal
3317 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3318 !cur->item_open_file.is_on_list()) {
3319 LogSegment *ls = mds->mdlog->get_current_segment();
3320 EOpen *le = new EOpen(mds->mdlog);
3321 mdlog->start_entry(le);
3322 le->add_clean_inode(cur);
3323 ls->open_files.push_back(&cur->item_open_file);
3324 mdlog->submit_entry(le);
3325 }
3326
3327 // hit pop
3328 if (cmode & CEPH_FILE_MODE_WR)
3329 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IWR);
3330 else
3331 mds->balancer->hit_inode(mdr->get_mds_stamp(), cur, META_POP_IRD,
3332 mdr->client_request->get_source().num());
3333
3334 CDentry *dn = 0;
3335 if (req->get_dentry_wanted()) {
3336 assert(mdr->dn[0].size());
3337 dn = mdr->dn[0].back();
3338 }
3339
3340 mdr->tracei = cur;
3341 mdr->tracedn = dn;
3342 respond_to_request(mdr, 0);
3343}
3344
3345class C_MDS_openc_finish : public ServerLogContext {
3346 CDentry *dn;
3347 CInode *newi;
3348 snapid_t follows;
3349public:
3350 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
3351 ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
3352 void finish(int r) override {
3353 assert(r == 0);
3354
3355 dn->pop_projected_linkage();
3356
3357 // dirty inode, dn, dir
3358 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3359 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3360 newi->_mark_dirty_parent(mdr->ls, true);
3361
3362 mdr->apply();
3363
3364 get_mds()->locker->share_inode_max_size(newi);
3365
3366 MDRequestRef null_ref;
3367 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3368
3369 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);
3370
3371 server->respond_to_request(mdr, 0);
3372
3373 assert(g_conf->mds_kill_openc_at != 1);
3374 }
3375};
3376
/* This function takes responsibility for the passed mdr*/
/**
 * Handle open with O_CREAT: if the path already resolves (and !O_EXCL),
 * fall through to plain open; otherwise create the inode with the
 * requested/inherited layout, journal the create, issue caps, and reply.
 */
void Server::handle_client_openc(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  if (cmode < 0) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  bool excl = req->head.args.open.flags & CEPH_O_EXCL;

  if (!excl) {
    // probe the path first: if it already exists this is a plain open
    int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
				   &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
    if (r > 0) return;   // delayed (forwarded or waiting)
    if (r == 0) {
      // it existed.
      handle_client_open(mdr);
      return;
    }
    if (r < 0 && r != -ENOENT) {
      if (r == -ESTALE) {
	dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
	MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
	mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
      } else {
	dout(10) << "FAIL on error " << r << dendl;
	respond_to_request(mdr, r);
      }
      return;
    }
    // r == -ENOENT: proceed to create
  }

  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  // okexist = !excl: with O_EXCL an existing dentry must fail below
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
                                         !excl, false, false, &dir_layout);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    // cannot create inside a snapshot
    respond_to_request(mdr, -EROFS);
    return;
  }
  // set layout: inherit from nearest dir layout, else filesystem default
  file_layout_t layout;
  if (dir_layout)
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  // What kind of client caps are required to complete this operation
  uint64_t access = MAY_WRITE;

  const auto default_layout = layout;

  // fill in any special params from client
  if (req->head.args.open.stripe_unit)
    layout.stripe_unit = req->head.args.open.stripe_unit;
  if (req->head.args.open.stripe_count)
    layout.stripe_count = req->head.args.open.stripe_count;
  if (req->head.args.open.object_size)
    layout.object_size = req->head.args.open.object_size;
  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
      (__s32)req->head.args.open.pool >= 0) {
    layout.pool_id = req->head.args.open.pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // If client doesn't have capability to modify layout pools, then
  // only permit this request if the requested pool matches what the
  // file would have inherited anyway from its parent.
  if (default_layout != layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << " invalid initial file layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // created null dn.
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, access))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  CDentry::linkage_t *dnl = dn->get_projected_linkage();

  if (!dnl->is_null()) {
    // it existed.
    assert(req->head.args.open.flags & CEPH_O_EXCL);
    dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
    mdr->tracei = dnl->get_inode();
    mdr->tracedn = dn;
    respond_to_request(mdr, -EEXIST);
    return;
  }

  // create inode.
  SnapRealm *realm = diri->find_snaprealm();   // use directory's realm; inode isn't attached yet.
  snapid_t follows = realm->get_newest_seq();

  CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				 req->head.args.open.mode | S_IFREG, &layout);
  assert(in);

  // it's a file.
  dn->push_projected_linkage(in);

  in->inode.version = dn->pre_dirty();
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  in->inode.update_backtrace();
  if (cmode & CEPH_FILE_MODE_WR) {
    // pre-grant a writable byte range so the client can write immediately
    in->inode.client_ranges[client].range.first = 0;
    in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
    in->inode.client_ranges[client].follows = follows;
  }
  in->inode.rstat.rfiles = 1;

  assert(dn->first == follows+1);
  in->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, in, true, true, true);

  // do the open
  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
  // brand-new inode: this client is the only one with caps
  in->authlock.set_state(LOCK_EXCL);
  in->xattrlock.set_state(LOCK_EXCL);

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(in->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&in->item_open_file);

  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);

  if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
    dout(10) << "adding ino to reply to indicate inode was created" << dendl;
    // add the file created flag onto the reply if create_flags features is supported
    ::encode(in->inode.ino, mdr->reply_extra_bl);
  }

  journal_and_reply(mdr, in, dn, le, fin);

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple opencs in flight), so here is
  // an early chance to split the dir if this openc makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
}
3556
3557
3558
/**
 * Handle a client readdir: resolve the requested dirfrag, make sure it
 * is complete in cache, then encode as many (dentry, lease, inodestat)
 * tuples as fit within the client's max_entries / max_bytes budget,
 * starting after the client-supplied offset.
 */
void Server::handle_client_readdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = req->get_source().num();
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
  if (!diri) return;

  // it's a directory, right?
  if (!diri->is_dir()) {
    // not a dir
    dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // filelock guards dir contents; dirfragtreelock guards the frag tree
  rdlocks.insert(&diri->filelock);
  rdlocks.insert(&diri->dirfragtreelock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  // which frag?
  frag_t fg = (__u32)req->head.args.readdir.frag;
  unsigned req_flags = (__u32)req->head.args.readdir.flags;
  string offset_str = req->get_path2();

  // resume point: either a dentry name or (for hash-order clients) a hash
  __u32 offset_hash = 0;
  if (!offset_str.empty())
    offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
  else
    offset_hash = (__u32)req->head.args.readdir.offset_hash;

  dout(10) << " frag " << fg << " offset '" << offset_str << "'"
	   << " offset_hash " << offset_hash << " flags " << req_flags << dendl;

  // does the frag exist?  (the tree may have split/merged since the
  // client got this frag value)
  if (diri->dirfragtree[fg.value()] != fg) {
    frag_t newfg;
    if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
      if (fg.contains((unsigned)offset_hash)) {
	newfg = diri->dirfragtree[offset_hash];
      } else {
	// client actually wants next frag
	newfg = diri->dirfragtree[fg.value()];
      }
    } else {
      offset_str.clear();
      newfg = diri->dirfragtree[fg.value()];
    }
    dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
    fg = newfg;
  }

  CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
  if (!dir) return;

  // ok!
  dout(10) << "handle_client_readdir on " << *dir << dendl;
  assert(dir->is_auth());

  if (!dir->is_complete()) {
    if (dir->is_frozen()) {
      dout(7) << "dir is frozen " << *dir << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
    // fetch
    dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
    dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
    return;
  }

#ifdef MDS_VERIFY_FRAGSTAT
  dir->verify_fragstat();
#endif

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  snapid_t snapid = mdr->snapid;
  dout(10) << "snapid " << snapid << dendl;

  SnapRealm *realm = diri->find_snaprealm();

  unsigned max = req->head.args.readdir.max_entries;
  if (!max)
    max = dir->get_num_any();  // whatever, something big.
  unsigned max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;

  // start final blob
  bufferlist dirbl;
  dir->encode_dirstat(dirbl, mds->get_nodeid());

  // count bytes available.
  //  this isn't perfect, but we should capture the main variable/unbounded size items!
  int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
  int bytes_left = max_bytes - front_bytes;
  bytes_left -= realm->get_snap_trace().length();

  // build dir contents
  bufferlist dnbl;
  __u32 numfiles = 0;
  bool start = !offset_hash && offset_str.empty();
  bool end = (dir->begin() == dir->end());
  // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
  dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
  for (CDir::map_t::iterator it = start ? dir->begin() : dir->lower_bound(skip_key);
       !end && numfiles < max;
       end = (it == dir->end())) {
    CDentry *dn = it->second;
    ++it;

    if (dn->state_test(CDentry::STATE_PURGING))
      continue;

    bool dnp = dn->use_projected(client, mdr);
    CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();

    if (dnl->is_null())
      continue;

    // only include dentries whose [first,last] covers the requested snapid
    if (dn->last < snapid || dn->first > snapid) {
      dout(20) << "skipping non-overlapping snap " << *dn << dendl;
      continue;
    }

    if (!start) {
      dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
      if (!(offset_key < dn->key()))
	continue;
    }

    CInode *in = dnl->get_inode();

    if (in && in->ino() == CEPH_INO_CEPH)
      continue;

    // remote link?
    // better for the MDS to do the work, if we think the client will stat any of these files.
    if (dnl->is_remote() && !in) {
      in = mdcache->get_inode(dnl->get_remote_ino());
      if (in) {
	dn->link_remote(dnl, in);
      } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
	dout(10) << "skipping bad remote ino on " << *dn << dendl;
	continue;
      } else {
	// touch everything i _do_ have
	for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p)
	  if (!p->second->get_linkage()->is_null())
	    mdcache->lru.lru_touch(p->second);

	// already issued caps and leases, reply immediately.
	if (dnbl.length() > 0) {
	  mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
	  dout(10) << " open remote dentry after caps were issued, stopping at "
		   << dnbl.length() << " < " << bytes_left << dendl;
	  break;
	}

	// nothing encoded yet: safe to drop everything and retry later
	mds->locker->drop_locks(mdr.get());
	mdr->drop_local_auth_pins();
	mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
	return;
      }
    }
    assert(in);

    // rough upper bound on the entry's encoded size before committing to it
    if ((int)(dnbl.length() + dn->name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
      dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
      break;
    }

    unsigned start_len = dnbl.length();

    // dentry
    dout(12) << "including    dn " << *dn << dendl;
    ::encode(dn->name, dnbl);
    mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);

    // inode
    dout(12) << "including inode " << *in << dendl;
    int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
    if (r < 0) {
      // chop off dn->name, lease
      dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    assert(r >= 0);
    numfiles++;

    // touch dn
    mdcache->lru.lru_touch(dn);
  }

  __u16 flags = 0;
  if (end) {
    flags = CEPH_READDIR_FRAG_END;
    if (start)
      flags |= CEPH_READDIR_FRAG_COMPLETE;   // FIXME: what purpose does this serve
  }
  // client only understand END and COMPLETE flags ?
  if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
    flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
  }

  // finish final blob
  ::encode(numfiles, dirbl);
  ::encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  // yay, reply
  dout(10) << "reply to " << *req << " readdir num=" << numfiles
	   << " bytes=" << dirbl.length()
	   << " start=" << (int)start
	   << " end=" << (int)end
	   << dendl;
  mdr->reply_extra_bl = dirbl;

  // bump popularity.  NOTE: this doesn't quite capture it.
  mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);

  // reply
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
3797
3798
3799
3800// ===============================================================================
3801// INODE UPDATES
3802
3803
3804/*
3805 * finisher for basic inode updates
3806 */
3807class C_MDS_inode_update_finish : public ServerLogContext {
3808 CInode *in;
3809 bool truncating_smaller, changed_ranges;
3810public:
3811 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
3812 bool sm=false, bool cr=false) :
3813 ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
3814 void finish(int r) override {
3815 assert(r == 0);
3816
3817 // apply
3818 in->pop_and_dirty_projected_inode(mdr->ls);
3819 mdr->apply();
3820
3821 // notify any clients
3822 if (truncating_smaller && in->inode.is_truncating()) {
3823 get_mds()->locker->issue_truncate(in);
3824 get_mds()->mdcache->truncate_inode(in, mdr->ls);
3825 }
3826
3827 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
3828
3829 server->respond_to_request(mdr, 0);
3830
3831 if (changed_ranges)
3832 get_mds()->locker->share_inode_max_size(in);
3833 }
3834};
3835
3836void Server::handle_client_file_setlock(MDRequestRef& mdr)
3837{
3838 MClientRequest *req = mdr->client_request;
3839 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3840
3841 // get the inode to operate on, and set up any locks needed for that
3842 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3843 if (!cur)
3844 return;
3845
3846 xlocks.insert(&cur->flocklock);
3847 /* acquire_locks will return true if it gets the locks. If it fails,
3848 it will redeliver this request at a later date, so drop the request.
3849 */
3850 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3851 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3852 return;
3853 }
3854
3855 // copy the lock change into a ceph_filelock so we can store/apply it
3856 ceph_filelock set_lock;
3857 set_lock.start = req->head.args.filelock_change.start;
3858 set_lock.length = req->head.args.filelock_change.length;
3859 set_lock.client = req->get_orig_source().num();
3860 set_lock.owner = req->head.args.filelock_change.owner;
3861 set_lock.pid = req->head.args.filelock_change.pid;
3862 set_lock.type = req->head.args.filelock_change.type;
3863 bool will_wait = req->head.args.filelock_change.wait;
3864
3865 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3866
3867 ceph_lock_state_t *lock_state = NULL;
3868 bool interrupt = false;
3869
3870 // get the appropriate lock state
3871 switch (req->head.args.filelock_change.rule) {
3872 case CEPH_LOCK_FLOCK_INTR:
3873 interrupt = true;
3874 // fall-thru
3875 case CEPH_LOCK_FLOCK:
3876 lock_state = cur->get_flock_lock_state();
3877 break;
3878
3879 case CEPH_LOCK_FCNTL_INTR:
3880 interrupt = true;
3881 // fall-thru
3882 case CEPH_LOCK_FCNTL:
3883 lock_state = cur->get_fcntl_lock_state();
3884 break;
3885
3886 default:
3887 dout(10) << "got unknown lock type " << set_lock.type
3888 << ", dropping request!" << dendl;
3889 respond_to_request(mdr, -EOPNOTSUPP);
3890 return;
3891 }
3892
3893 dout(10) << " state prior to lock change: " << *lock_state << dendl;
3894 if (CEPH_LOCK_UNLOCK == set_lock.type) {
3895 list<ceph_filelock> activated_locks;
3896 list<MDSInternalContextBase*> waiters;
3897 if (lock_state->is_waiting(set_lock)) {
3898 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
3899 lock_state->remove_waiting(set_lock);
3900 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3901 } else if (!interrupt) {
3902 dout(10) << " unlock attempt on " << set_lock << dendl;
3903 lock_state->remove_lock(set_lock, activated_locks);
3904 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
3905 }
3906 mds->queue_waiters(waiters);
3907
3908 respond_to_request(mdr, 0);
3909 } else {
3910 dout(10) << " lock attempt on " << set_lock << dendl;
3911 bool deadlock = false;
3912 if (mdr->more()->flock_was_waiting &&
3913 !lock_state->is_waiting(set_lock)) {
3914 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
3915 respond_to_request(mdr, -EINTR);
3916 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
3917 dout(10) << " it failed on this attempt" << dendl;
3918 // couldn't set lock right now
3919 if (deadlock) {
3920 respond_to_request(mdr, -EDEADLK);
3921 } else if (!will_wait) {
3922 respond_to_request(mdr, -EWOULDBLOCK);
3923 } else {
3924 dout(10) << " added to waiting list" << dendl;
3925 assert(lock_state->is_waiting(set_lock));
3926 mdr->more()->flock_was_waiting = true;
3927 mds->locker->drop_locks(mdr.get());
3928 mdr->drop_local_auth_pins();
3929 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
3930 }
3931 } else
3932 respond_to_request(mdr, 0);
3933 }
3934 dout(10) << " state after lock change: " << *lock_state << dendl;
3935}
3936
3937void Server::handle_client_file_readlock(MDRequestRef& mdr)
3938{
3939 MClientRequest *req = mdr->client_request;
3940 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3941
3942 // get the inode to operate on, and set up any locks needed for that
3943 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3944 if (!cur)
3945 return;
3946
3947 /* acquire_locks will return true if it gets the locks. If it fails,
3948 it will redeliver this request at a later date, so drop the request.
3949 */
3950 rdlocks.insert(&cur->flocklock);
3951 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3952 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
3953 return;
3954 }
3955
3956 // copy the lock change into a ceph_filelock so we can store/apply it
3957 ceph_filelock checking_lock;
3958 checking_lock.start = req->head.args.filelock_change.start;
3959 checking_lock.length = req->head.args.filelock_change.length;
3960 checking_lock.client = req->get_orig_source().num();
3961 checking_lock.owner = req->head.args.filelock_change.owner;
3962 checking_lock.pid = req->head.args.filelock_change.pid;
3963 checking_lock.type = req->head.args.filelock_change.type;
3964
3965 // get the appropriate lock state
3966 ceph_lock_state_t *lock_state = NULL;
3967 switch (req->head.args.filelock_change.rule) {
3968 case CEPH_LOCK_FLOCK:
3969 lock_state = cur->get_flock_lock_state();
3970 break;
3971
3972 case CEPH_LOCK_FCNTL:
3973 lock_state = cur->get_fcntl_lock_state();
3974 break;
3975
3976 default:
3977 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
3978 respond_to_request(mdr, -EINVAL);
3979 return;
3980 }
3981 lock_state->look_for_lock(checking_lock);
3982
3983 bufferlist lock_bl;
3984 ::encode(checking_lock, lock_bl);
3985
3986 mdr->reply_extra_bl = lock_bl;
3987 respond_to_request(mdr, 0);
3988}
3989
/*
 * Handle CEPH_MDS_OP_SETATTR: apply the attribute changes selected by
 * req->head.args.setattr.mask (mode/uid/gid/size/mtime/atime/btime) to
 * the inode at the request path.
 *
 * Takes responsibility for the passed mdr: every path either responds
 * with an error, re-queues the request (lock retry, pending truncate),
 * or journals an EUpdate and replies from C_MDS_inode_update_finish.
 */
void Server::handle_client_setattr(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur) return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  // reserved system inodes (ino below MDS_INO_SYSTEM_BASE) may only be
  // changed if they are base inodes (e.g. root)
  if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  __u32 mask = req->head.args.setattr.mask;
  __u32 access_mask = MAY_WRITE;

  // xlock inode: ownership/mode bits live under authlock, size/times under
  // filelock; a bare ctime change only needs the versionlock
  if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
    xlocks.insert(&cur->authlock);
  if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
    xlocks.insert(&cur->filelock);
  if (mask & CEPH_SETATTR_CTIME)
    wrlocks.insert(&cur->versionlock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // changing owner/group requires extra capability bits beyond MAY_WRITE
  if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
    access_mask |= MAY_CHOWN;

  if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
    access_mask |= MAY_CHGRP;

  if (!check_access(mdr, cur, access_mask))
    return;

  // trunc from bigger -> smaller?
  inode_t *pi = cur->get_projected_inode();

  // old_size is the max of our size and what the client last saw, so a
  // racing write can't make a shrink look like a grow
  uint64_t old_size = MAX(pi->size, req->head.args.setattr.old_size);

  // ENOSPC on growing file while full, but allow shrinks
  if (is_full && req->head.args.setattr.size > old_size) {
    dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
    respond_to_request(mdr, -ENOSPC);
    return;
  }

  bool truncating_smaller = false;
  if (mask & CEPH_SETATTR_SIZE) {
    truncating_smaller = req->head.args.setattr.size < old_size;
    // only one truncate can be in flight per inode: wait out any pending
    // one before projecting another
    if (truncating_smaller && pi->is_truncating()) {
      dout(10) << " waiting for pending truncate from " << pi->truncate_from
	       << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  bool changed_ranges = false;

  // project update
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setattr");
  mdlog->start_entry(le);

  pi = cur->project_inode();

  if (mask & CEPH_SETATTR_UID)
    pi->uid = req->head.args.setattr.uid;
  if (mask & CEPH_SETATTR_GID)
    pi->gid = req->head.args.setattr.gid;

  if (mask & CEPH_SETATTR_MODE)
    pi->mode = (pi->mode & ~07777) | (req->head.args.setattr.mode & 07777);
  else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
	    S_ISREG(pi->mode)) {
    // clear setuid/setgid on ownership change (setgid only if group-exec
    // is set, matching POSIX chown semantics for regular files)
    pi->mode &= ~S_ISUID;
    if ((pi->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
      pi->mode &= ~S_ISGID;
  }

  if (mask & CEPH_SETATTR_MTIME)
    pi->mtime = req->head.args.setattr.mtime;
  if (mask & CEPH_SETATTR_ATIME)
    pi->atime = req->head.args.setattr.atime;
  if (mask & CEPH_SETATTR_BTIME)
    pi->btime = req->head.args.setattr.btime;
  if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
    pi->time_warp_seq++;   // maybe not a timewarp, but still a serialization point.
  if (mask & CEPH_SETATTR_SIZE) {
    if (truncating_smaller) {
      // shrink: record the truncate; object removal happens after commit
      pi->truncate(old_size, req->head.args.setattr.size);
      le->metablob.add_truncate_start(cur->ino());
    } else {
      // grow: just bump size and rstat
      pi->size = req->head.args.setattr.size;
      pi->rstat.rbytes = pi->size;
    }
    pi->mtime = mdr->get_op_stamp();

    // adjust client's max_size?
    map<client_t,client_writeable_range_t> new_ranges;
    bool max_increased = false;
    mds->locker->calc_new_client_ranges(cur, pi->size, &new_ranges, &max_increased);
    if (pi->client_ranges != new_ranges) {
      dout(10) << " client_ranges " << pi->client_ranges << " -> " << new_ranges << dendl;
      pi->client_ranges = new_ranges;
      changed_ranges = true;
    }
  }

  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;

  // log + wait
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
								   truncating_smaller, changed_ranges));

  // flush immediately if there are readers/writers waiting
  if (xlocks.count(&cur->filelock) &&
      (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
4123
/*
 * Open with O_TRUNC: issue caps for the open mode, then project and
 * journal a truncate of the inode down to size 0.
 *
 * Takes responsibility for mdr: journals an EUpdate and replies via
 * C_MDS_inode_update_finish.
 */
void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
{
  CInode *in = mdr->in[0];
  client_t client = mdr->get_client();
  assert(in);

  dout(10) << "do_open_truncate " << *in << dendl;

  // issue caps for the open before journaling the truncate
  SnapRealm *realm = in->find_snaprealm();
  mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "open_truncate");
  mdlog->start_entry(le);

  // prepare
  inode_t *pi = in->project_inode();
  pi->version = in->pre_dirty();
  pi->mtime = pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;

  // take the max of our size and the client's old_size (same guard as
  // setattr against a racing write)
  uint64_t old_size = MAX(pi->size, mdr->client_request->head.args.open.old_size);
  if (old_size > 0) {
    // only project a truncate if there is actually data to remove
    pi->truncate(old_size, 0);
    le->metablob.add_truncate_start(in->ino());
  }

  bool changed_ranges = false;
  if (cmode & CEPH_FILE_MODE_WR) {
    // seed a writeable range for the opening client
    pi->client_ranges[client].range.first = 0;
    pi->client_ranges[client].range.last = pi->get_layout_size_increment();
    pi->client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
    changed_ranges = true;
  }

  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());

  mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);

  // make sure ino gets into the journal
  le->metablob.add_opened_ino(in->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&in->item_open_file);

  mdr->o_trunc = true;

  CDentry *dn = 0;
  if (mdr->client_request->get_dentry_wanted()) {
    assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();
  }

  journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
								   changed_ranges));
  // Although the `open` part can give an early reply, the truncation won't
  // happen until our EUpdate is persistent, to give the client a prompt
  // response we must also flush that event.
  mdlog->flush();
}
4185
4186
/*
 * Handle CEPH_MDS_OP_SETLAYOUT: change the file layout of a regular file.
 * Only permitted while the file is still empty (no size, never truncated).
 *
 * This function cleans up the passed mdr: it responds with an error,
 * re-queues the request (lock retry, stale mdsmap), or journals an
 * EUpdate and replies from C_MDS_inode_update_finish.
 */
void Server::handle_client_setlayout(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur) return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  if (!cur->is_file()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  // layout may only be changed before any data exists
  if (cur->get_projected_inode()->size ||
      cur->get_projected_inode()->truncate_seq > 1) {
    respond_to_request(mdr, -ENOTEMPTY);
    return;
  }

  // validate layout
  file_layout_t layout = cur->get_projected_inode()->layout;
  // save existing layout for later
  const auto old_layout = layout;

  int access = MAY_WRITE;

  // fields with value 0 in the request are left unchanged
  if (req->head.args.setlayout.layout.fl_object_size > 0)
    layout.object_size = req->head.args.setlayout.layout.fl_object_size;
  if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
    layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
  if (req->head.args.setlayout.layout.fl_stripe_count > 0)
    layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
    layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // Don't permit layout modifications without 'p' caps
  if (layout != old_layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << "bad layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  xlocks.insert(&cur->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, cur, access))
    return;

  // project update
  inode_t *pi = cur->project_inode();
  pi->layout = layout;
  // add the old pool to the inode (so backtraces in the old pool can
  // still be cleaned up later)
  pi->add_old_pool(old_layout.pool_id);
  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setlayout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4274
/*
 * Handle setting the default file layout policy on a directory.
 * New layout fields start from the directory's current (projected or
 * inherited) layout; zero-valued request fields are left unchanged.
 *
 * Cleans up the passed mdr: responds, re-queues (lock retry, stale
 * mdsmap), or journals and replies via C_MDS_inode_update_finish.
 */
void Server::handle_client_setdirlayout(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
  if (!cur) return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  if (!cur->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // directory layout policy lives under the policylock
  xlocks.insert(&cur->policylock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // validate layout: start from own layout, else nearest ancestor's,
  // else the filesystem default
  const inode_t *old_pi = cur->get_projected_inode();
  file_layout_t layout;
  if (old_pi->has_layout())
    layout = old_pi->layout;
  else if (dir_layout)
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  // Level of access required to complete
  int access = MAY_WRITE;

  const auto old_layout = layout;

  if (req->head.args.setlayout.layout.fl_object_size > 0)
    layout.object_size = req->head.args.setlayout.layout.fl_object_size;
  if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
    layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
  if (req->head.args.setlayout.layout.fl_stripe_count > 0)
    layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
  if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
    layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // actually changing the layout requires the 'p' cap
  if (layout != old_layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << "bad layout" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  if (!check_access(mdr, cur, access))
    return;

  inode_t *pi = cur->project_inode();
  pi->layout = layout;
  pi->version = cur->pre_dirty();

  // log + wait
  // NOTE: journal event label "setlayout" is shared with handle_client_setlayout
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setlayout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4359
4360// XATTRS
4361
/*
 * Parse one layout virtual xattr into *layout: either the composite
 * "layout" form ("key1=v1 key2=v2 ..."), or a single field
 * (layout.object_size / .stripe_unit / .stripe_count / .pool /
 * .pool_namespace).
 *
 * Returns 0 on success, -EINVAL on parse or validation failure, and
 * -ENOENT when a pool is named that the given osdmap does not know.
 * When validate is false the final layout->is_valid() check is skipped
 * (used for the per-key recursion of the composite form, so intermediate
 * states aren't rejected); note the data-pool check below runs regardless.
 */
int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
				file_layout_t *layout, bool validate)
{
  dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
  try {
    if (name == "layout") {
      string::iterator begin = value.begin();
      string::iterator end = value.end();
      keys_and_values<string::iterator> p; // create instance of parser
      std::map<string, string> m; // map to receive results
      if (!qi::parse(begin, end, p, m)) { // returns true if successful
	return -EINVAL;
      }
      string left(begin, end);
      dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
      // any unconsumed input means the value was malformed
      if (begin != end)
	return -EINVAL;
      for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
	// Skip validation on each attr, we do it once at the end (avoid
	// rejecting intermediate states if the overall result is ok)
	int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
				    osdmap, layout, false);
	if (r < 0)
	  return r;
      }
    } else if (name == "layout.object_size") {
      layout->object_size = boost::lexical_cast<unsigned>(value);
    } else if (name == "layout.stripe_unit") {
      layout->stripe_unit = boost::lexical_cast<unsigned>(value);
    } else if (name == "layout.stripe_count") {
      layout->stripe_count = boost::lexical_cast<unsigned>(value);
    } else if (name == "layout.pool") {
      // accept either a numeric pool id or a pool name
      try {
	layout->pool_id = boost::lexical_cast<unsigned>(value);
      } catch (boost::bad_lexical_cast const&) {
	int64_t pool = osdmap.lookup_pg_pool_name(value);
	if (pool < 0) {
	  dout(10) << " unknown pool " << value << dendl;
	  return -ENOENT;
	}
	layout->pool_id = pool;
      }
    } else if (name == "layout.pool_namespace") {
      layout->pool_ns = value;
    } else {
      dout(10) << " unknown layout vxattr " << name << dendl;
      return -EINVAL;
    }
  } catch (boost::bad_lexical_cast const&) {
    dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
    return -EINVAL;
  }

  if (validate && !layout->is_valid()) {
    dout(10) << "bad layout" << dendl;
    return -EINVAL;
  }
  // the chosen pool must be a configured cephfs data pool
  if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
    dout(10) << " invalid data pool " << layout->pool_id << dendl;
    return -EINVAL;
  }
  return 0;
}
4425
4426int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4427{
4428 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4429 try {
4430 if (name == "quota") {
4431 string::iterator begin = value.begin();
4432 string::iterator end = value.end();
4433 keys_and_values<string::iterator> p; // create instance of parser
4434 std::map<string, string> m; // map to receive results
4435 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4436 return -EINVAL;
4437 }
4438 string left(begin, end);
4439 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4440 if (begin != end)
4441 return -EINVAL;
4442 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4443 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4444 if (r < 0)
4445 return r;
4446 }
4447 } else if (name == "quota.max_bytes") {
4448 int64_t q = boost::lexical_cast<int64_t>(value);
4449 if (q < 0)
4450 return -EINVAL;
4451 quota->max_bytes = q;
4452 } else if (name == "quota.max_files") {
4453 int64_t q = boost::lexical_cast<int64_t>(value);
4454 if (q < 0)
4455 return -EINVAL;
4456 quota->max_files = q;
4457 } else {
4458 dout(10) << " unknown quota vxattr " << name << dendl;
4459 return -EINVAL;
4460 }
4461 } catch (boost::bad_lexical_cast const&) {
4462 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4463 return -EINVAL;
4464 }
4465
4466 if (!quota->is_valid()) {
4467 dout(10) << "bad quota" << dendl;
4468 return -EINVAL;
4469 }
4470 return 0;
4471}
4472
4473/*
4474 * Verify that the file layout attribute carried by client
4475 * is well-formatted.
4476 * Return 0 on success, otherwise this function takes
4477 * responsibility for the passed mdr.
4478 */
4479int Server::check_layout_vxattr(MDRequestRef& mdr,
4480 string name,
4481 string value,
4482 file_layout_t *layout)
4483{
4484 MClientRequest *req = mdr->client_request;
4485 epoch_t epoch;
4486 int r;
4487
4488 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4489 r = parse_layout_vxattr(name, value, osdmap, layout);
4490 epoch = osdmap.get_epoch();
4491 });
4492
4493 if (r == -ENOENT) {
4494
4495 // we don't have the specified pool, make sure our map
4496 // is newer than or as new as the client.
4497 epoch_t req_epoch = req->get_osdmap_epoch();
4498
4499 if (req_epoch > epoch) {
4500
4501 // well, our map is older. consult mds.
4502 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4503
4504 if (!mds->objecter->wait_for_map(req_epoch, fin))
4505 return r; // wait, fin will retry this request later
4506
4507 delete fin;
4508
4509 // now we have at least as new a map as the client, try again.
4510 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4511 r = parse_layout_vxattr(name, value, osdmap, layout);
4512 epoch = osdmap.get_epoch();
4513 });
4514
4515 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4516
4517 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4518
4519 // For compatibility with client w/ old code, we still need get the
4520 // latest map. One day if COMPACT_VERSION of MClientRequest >=3,
4521 // we can remove those code.
4522 mdr->waited_for_osdmap = true;
4523 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4524 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4525 return r;
4526 }
4527 }
4528
4529 if (r < 0) {
4530
4531 if (r == -ENOENT)
4532 r = -EINVAL;
4533
4534 respond_to_request(mdr, r);
4535 return r;
4536 }
4537
4538 // all is well
4539 return 0;
4540}
4541
/*
 * Apply a "ceph.*" virtual xattr set request: directory or file layout,
 * quota, or directory export pin.  The value is carried in the request's
 * data payload.
 *
 * Takes responsibility for mdr: every path responds, re-queues (lock
 * retry, osdmap wait inside check_layout_vxattr), or journals an EUpdate
 * and replies from C_MDS_inode_update_finish.
 */
void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
			       file_layout_t *dir_layout,
			       set<SimpleLock*> rdlocks,
			       set<SimpleLock*> wrlocks,
			       set<SimpleLock*> xlocks)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());
  bufferlist bl = req->get_data();
  string value (bl.c_str(), bl.length());
  dout(10) << "handle_set_vxattr " << name
           << " val " << value.length()
           << " bytes on " << *cur
           << dendl;

  inode_t *pi = NULL;
  string rest;

  // all vxattr changes require the 'p' (SET_VXATTR) capability
  if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
    return;
  }

  if (name.compare(0, 15, "ceph.dir.layout") == 0) {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    // start from the dir's own layout, else the inherited one, else default
    file_layout_t layout;
    if (cur->get_projected_inode()->has_layout())
      layout = cur->get_projected_inode()->layout;
    else if (dir_layout)
      layout = *dir_layout;
    else
      layout = mdcache->default_file_layout;

    // strip the "ceph.dir." prefix: parse_layout_vxattr expects "layout..."
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
      return;

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    pi->layout = layout;
  } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
    if (!cur->is_file()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }
    // file layout may only change while the file is still empty
    if (cur->get_projected_inode()->size ||
        cur->get_projected_inode()->truncate_seq > 1) {
      respond_to_request(mdr, -ENOTEMPTY);
      return;
    }
    file_layout_t layout = cur->get_projected_inode()->layout;
    rest = name.substr(name.find("layout"));
    if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
      return;

    xlocks.insert(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    // remember the previous pool so data there can still be found/cleaned
    int64_t old_pool = pi->layout.pool_id;
    pi->add_old_pool(old_pool);
    pi->layout = layout;
    pi->ctime = mdr->get_op_stamp();
  } else if (name.compare(0, 10, "ceph.quota") == 0) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    quota_info_t quota = cur->get_projected_inode()->quota;

    rest = name.substr(name.find("quota"));
    int r = parse_quota_vxattr(rest, value, &quota);
    if (r < 0) {
      respond_to_request(mdr, r);
      return;
    }

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    pi->quota = quota;
  } else if (name.find("ceph.dir.pin") == 0) {
    if (!cur->is_dir() || cur->is_root()) {
      respond_to_request(mdr, -EINVAL);
      return;
    }

    // value is a target mds rank; any negative value means "unpin"
    mds_rank_t rank;
    try {
      rank = boost::lexical_cast<mds_rank_t>(value);
      if (rank < 0) rank = MDS_RANK_NONE;
    } catch (boost::bad_lexical_cast const&) {
      dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    pi = cur->project_inode();
    cur->set_export_pin(rank);
  } else {
    dout(10) << " unknown vxattr " << name << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  pi->change_attr++;
  pi->ctime = mdr->get_op_stamp();
  pi->version = cur->pre_dirty();
  if (cur->is_file())
    pi->update_backtrace();

  // log + wait
  // NOTE: the journal label below is generic for all vxattr branches
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
  return;
}
4678
/*
 * Handle removal of a "ceph.*" virtual xattr.  Only ceph.dir.layout (and
 * the pool_namespace field of either layout, which is treated as a set to
 * empty) can be removed; everything else gets -ENODATA.
 *
 * Takes responsibility for mdr: responds, re-queues on lock retry, or
 * journals an EUpdate and replies from C_MDS_inode_update_finish.
 */
void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
				  file_layout_t *dir_layout,
				  set<SimpleLock*> rdlocks,
				  set<SimpleLock*> wrlocks,
				  set<SimpleLock*> xlocks)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());

  dout(10) << __func__ << " " << name << " on " << *cur << dendl;

  if (name == "ceph.dir.layout") {
    if (!cur->is_dir()) {
      respond_to_request(mdr, -ENODATA);
      return;
    }
    if (cur->is_root()) {
      dout(10) << "can't remove layout policy on the root directory" << dendl;
      respond_to_request(mdr, -EINVAL);
      return;
    }

    // nothing to remove if no layout policy is set
    if (!cur->get_projected_inode()->has_layout()) {
      respond_to_request(mdr, -ENODATA);
      return;
    }

    xlocks.insert(&cur->policylock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    inode_t *pi = cur->project_inode();
    pi->clear_layout();
    pi->version = cur->pre_dirty();

    // log + wait
    mdr->ls = mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
    mdlog->start_entry(le);
    le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
    mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

    journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
    return;
  } else if (name == "ceph.dir.layout.pool_namespace"
          || name == "ceph.file.layout.pool_namespace") {
    // Namespace is the only layout field that has a meaningful
    // null/none value (empty string, means default layout).  Is equivalent
    // to a setxattr with empty string: pass through the empty payload of
    // the rmxattr request to do this.
    handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
    return;
  }

  respond_to_request(mdr, -ENODATA);
}
4736
4737class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4738 CInode *in;
4739public:
4740
4741 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4742 ServerLogContext(s, r), in(i) { }
4743 void finish(int r) override {
4744 assert(r == 0);
4745
4746 // apply
4747 in->pop_and_dirty_projected_inode(mdr->ls);
4748
4749 mdr->apply();
4750
4751 get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);
4752
4753 server->respond_to_request(mdr, 0);
4754 }
4755};
4756
/*
 * Handle CEPH_MDS_OP_SETXATTR: set (or, with CEPH_XATTR_REMOVE, clear) an
 * extended attribute on an inode.  Names beginning with "ceph." are
 * virtual xattrs and are dispatched to handle_set_vxattr instead.
 *
 * Takes responsibility for mdr: responds, re-queues on lock retry, or
 * journals an EUpdate and replies from C_MDS_inode_update_finish.
 */
void Server::handle_client_setxattr(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CInode *cur;

  // dir layout vxattrs need the inherited layout resolved during traversal
  file_layout_t *dir_layout = NULL;
  if (name.compare(0, 15, "ceph.dir.layout") == 0)
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
  else
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur)
    return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  int flags = req->head.args.setxattr.flags;

  // magic ceph.* namespace?
  if (name.compare(0, 5, "ceph.") == 0) {
    handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
    return;
  }

  xlocks.insert(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, cur, MAY_WRITE))
    return;

  map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
  size_t len = req->get_data().length();
  size_t inc = len + name.length();

  // check xattrs kv pairs size: the new total must stay under the
  // configured cap (a REPLACE doesn't double-count the replaced pair)
  size_t cur_xattrs_size = 0;
  for (const auto& p : *pxattrs) {
    if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
      continue;
    }
    cur_xattrs_size += p.first.length() + p.second.length();
  }

  if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
    dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
             << cur_xattrs_size << ", inc " << inc << dendl;
    respond_to_request(mdr, -ENOSPC);
    return;
  }

  // honor XATTR_CREATE / XATTR_REPLACE exclusivity flags
  if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
    dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
    respond_to_request(mdr, -EEXIST);
    return;
  }
  if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(name)) {
    dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
    respond_to_request(mdr, -ENODATA);
    return;
  }

  dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;

  // project update: px becomes the projected xattr map (owned by the inode)
  map<string,bufferptr> *px = new map<string,bufferptr>;
  inode_t *pi = cur->project_inode(px);
  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->xattr_version++;
  // erase-then-insert implements both replace and (for XATTR_REMOVE) removal
  px->erase(name);
  if (!(flags & CEPH_XATTR_REMOVE)) {
    (*px)[name] = buffer::create(len);
    if (len)
      req->get_data().copy(0, len, (*px)[name].c_str());
  }

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "setxattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4849
/* Handle a client removexattr request on the inode at path 0.
 *
 * "ceph."-prefixed names are virtual xattrs and go to
 * handle_remove_vxattr; ordinary names are erased from a projected
 * copy of the xattr map under an xattrlock xlock and journaled as an
 * EUpdate.  Returns -ENODATA if the name is not present.
 * NOTE(review): the vxattr path check here is an exact match on
 * "ceph.dir.layout" while setxattr uses a prefix compare — presumably
 * intentional (remove acts on the whole layout), but worth confirming. */
void Server::handle_client_removexattr(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  string name(req->get_path2());
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CInode *cur;
  if (name == "ceph.dir.layout")
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
  else
    cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
  if (!cur)
    return;

  // snapshots are read-only
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  // magic ceph.* namespace?
  if (name.compare(0, 5, "ceph.") == 0) {
    handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
    return;
  }

  xlocks.insert(&cur->xattrlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
  if (pxattrs->count(name) == 0) {
    dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
    respond_to_request(mdr, -ENODATA);
    return;
  }

  dout(10) << "removexattr '" << name << "' on " << *cur << dendl;

  // project update: bump versions, erase the key from the projected map
  map<string,bufferptr> *px = new map<string,bufferptr>;
  inode_t *pi = cur->project_inode(px);
  pi->version = cur->pre_dirty();
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->xattr_version++;
  px->erase(name);

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "removexattr");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);

  journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
}
4906
4907
4908// =================================================================
4909// DIRECTORY and NAMESPACE OPS
4910
4911
4912// ------------------------------------------------
4913
4914// MKNOD
4915
// Journal-commit continuation shared by mknod, mkdir and symlink:
// links the new inode into its dentry, marks the inode (and, for
// mkdir, its new dirfrag) dirty, notifies replicas, and replies.
class C_MDS_mknod_finish : public ServerLogContext {
  CDentry *dn;     // dentry the new inode is being linked under
  CInode *newi;    // the freshly created inode
public:
  C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    assert(r == 0);

    // link the inode
    dn->pop_projected_linkage();

    // be a bit hacky with the inode version, here.. we decrement it
    // just to keep mark_dirty() happy. (we didn't bother projecting
    // a new version of the inode since it's just been created)
    newi->inode.version--;
    newi->mark_dirty(newi->inode.version + 1, mdr->ls);
    newi->_mark_dirty_parent(mdr->ls, true);

    // mkdir?  the new dirfrag gets the same version-decrement trick
    if (newi->inode.is_dir()) {
      CDir *dir = newi->get_dirfrag(frag_t());
      assert(dir);
      dir->fnode.version--;
      dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
      dir->mark_new(mdr->ls);
    }

    mdr->apply();

    // tell replicas about the now-linked dentry
    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    // regular files got a client_range at create time; publish max_size
    if (newi->inode.is_file())
      get_mds()->locker->share_inode_max_size(newi);

    // hit pop
    get_mds()->balancer->hit_inode(mdr->get_mds_stamp(), newi, META_POP_IWR);

    // reply
    server->respond_to_request(mdr, 0);
  }
};
4959
4960
/* Handle a client mknod request: create a new inode (defaulting to a
 * regular file if no type bits are given) under the dentry at path 0
 * and journal it as an EUpdate.  Regular files also get an initial
 * client_range and an exclusive RDWR cap, since the creator is likely
 * to write immediately.  Reply is sent from C_MDS_mknod_finish. */
void Server::handle_client_mknod(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  client_t client = mdr->get_client();
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  file_layout_t *dir_layout = NULL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
					 &dir_layout);
  if (!dn) return;
  // cannot create inside a snapshot
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CInode *diri = dn->get_dir()->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // refuse if the target dirfrag is over the configured size limit
  if (!check_fragment_space(mdr, dn->get_dir()))
    return;

  unsigned mode = req->head.args.mknod.mode;
  if ((mode & S_IFMT) == 0)
    mode |= S_IFREG;   // no type bits -> regular file

  // set layout: inherit the nearest dir layout for regular files
  file_layout_t layout;
  if (dir_layout && S_ISREG(mode))
    layout = *dir_layout;
  else
    layout = mdcache->default_file_layout;

  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
				   mode, &layout);
  assert(newi);

  dn->push_projected_linkage(newi);

  newi->inode.rdev = req->head.args.mknod.rdev;
  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rfiles = 1;
  // track the default pool as an "old pool" when a different layout is
  // used, so backtraces remain findable
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
  newi->inode.update_backtrace();

  // if the client created a _regular_ file via MKNOD, it's highly likely they'll
  // want to write to it (e.g., if they are reexporting NFS)
  if (S_ISREG(newi->inode.mode)) {
    dout(15) << " setting a client_range too, since this is a regular file" << dendl;
    newi->inode.client_ranges[client].range.first = 0;
    newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
    newi->inode.client_ranges[client].follows = follows;

    // issue a cap on the file
    int cmode = CEPH_FILE_MODE_RDWR;
    Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
    if (cap) {
      cap->set_wanted(0);

      // put locks in excl mode
      newi->filelock.set_state(LOCK_EXCL);
      newi->authlock.set_state(LOCK_EXCL);
      newi->xattrlock.set_state(LOCK_EXCL);
    }
  }

  assert(dn->first == follows + 1);
  newi->first = dn->first;

  dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mknod");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);

  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
				    PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5050
5051
5052
5053// MKDIR
5054/* This function takes responsibility for the passed mdr*/
/* Handle a client mkdir request: create a new directory inode plus an
 * empty, complete root dirfrag under the dentry at path 0, journal it
 * as an EUpdate, and issue an exclusive RDWR cap on the new dir.
 * Reply is sent from C_MDS_mknod_finish once the entry commits.
 * This function takes responsibility for the passed mdr. */
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  // cannot create inside a snapshot
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // mkdir check access
  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  // new inode
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();

  // force the type bits to S_IFDIR regardless of what the client sent
  unsigned mode = req->head.args.mkdir.mode;
  mode &= ~S_IFMT;
  mode |= S_IFDIR;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a directory.
  dn->push_projected_linkage(newi);

  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rsubdirs = 1;
  newi->inode.update_backtrace();

  dout(12) << " follows " << follows << dendl;
  assert(dn->first == follows + 1);
  newi->first = dn->first;

  // ...and that new dir is empty.
  CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
  newdir->state_set(CDir::STATE_CREATING);
  newdir->mark_complete();
  newdir->fnode.version = newdir->pre_dirty();

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mkdir");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new

  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
  if (cap) {
    cap->set_wanted(0);

    // put locks in excl mode
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
  }

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&newi->item_open_file);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5134
5135
5136// SYMLINK
5137
/* Handle a client symlink request: create a new S_IFLNK|0777 inode
 * under the dentry at path 0, store the link target (path 2) in the
 * inode's symlink field, and journal the create as an EUpdate.
 * Reply is sent from C_MDS_mknod_finish once the entry commits. */
void Server::handle_client_symlink(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  // cannot create inside a snapshot
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  if (!check_fragment_space(mdr, dir))
    return;

  unsigned mode = S_IFLNK | 0777;
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a symlink
  dn->push_projected_linkage(newi);

  // the link target is carried in path2; its length is the file size
  newi->symlink = req->get_path2();
  newi->inode.size = newi->symlink.length();
  newi->inode.rstat.rbytes = newi->inode.size;
  newi->inode.rstat.rfiles = 1;
  newi->inode.version = dn->pre_dirty();
  newi->inode.update_backtrace();

  newi->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "symlink");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}
5187
5188
5189
5190
5191
5192// LINK
5193
/* Handle a client hard-link request: path 0 names the new dentry,
 * path 1 the existing target inode.  Directories cannot be hard
 * linked (-EINVAL).  If this mds is auth for the target we do a
 * purely local update (_link_local); otherwise _link_remote runs a
 * two-phase prepare/commit with the target's auth mds. */
void Server::handle_client_link(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  dout(7) << "handle_client_link " << req->get_filepath()
	  << " to " << req->get_filepath2()
	  << dendl;

  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
  if (!targeti) return;
  // cannot link within a snapshot
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }

  CDir *dir = dn->get_dir();
  dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
  dout(7) << "target is " << *targeti << dendl;
  if (targeti->is_dir()) {
    dout(7) << "target is a dir, failing..." << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  xlocks.insert(&targeti->linklock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // only run the access/space checks the first time through; once a
  // slave has witnessed the prepare we must not bail out here
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, targeti, MAY_WRITE))
      return;

    if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
      return;

    if (!check_fragment_space(mdr, dir))
      return;
  }

  // go!
  assert(g_conf->mds_kill_link_at != 1);

  // local or remote?
  if (targeti->is_auth())
    _link_local(mdr, dn, targeti);
  else
    _link_remote(mdr, true, dn, targeti);
}
5247
5248
5249class C_MDS_link_local_finish : public ServerLogContext {
5250 CDentry *dn;
5251 CInode *targeti;
5252 version_t dnpv;
5253 version_t tipv;
5254public:
5255 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5256 version_t dnpv_, version_t tipv_) :
5257 ServerLogContext(s, r), dn(d), targeti(ti),
5258 dnpv(dnpv_), tipv(tipv_) { }
5259 void finish(int r) override {
5260 assert(r == 0);
5261 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5262 }
5263};
5264
5265
/* Perform a hard link when this mds is auth for the target inode:
 * project nlink++ on the target, journal both the new remote dentry
 * and the inode update in a single EUpdate, and reply from
 * C_MDS_link_local_finish once the entry commits. */
void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;

  mdr->ls = mdlog->get_current_segment();

  // predirty NEW dentry
  version_t dnpv = dn->pre_dirty();
  version_t tipv = targeti->pre_dirty();

  // project inode update
  inode_t *pi = targeti->project_inode();
  pi->nlink++;
  pi->ctime = mdr->get_op_stamp();
  pi->change_attr++;
  pi->version = tipv;

  // log + wait
  EUpdate *le = new EUpdate(mdlog, "link_local");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
  le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);

  // do this after predirty_*, to avoid funky extra dnl arg
  dn->push_projected_linkage(targeti->ino(), targeti->d_type());

  journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
}
5297
/* Commit side of _link_local, run after the EUpdate is durable:
 * materialize the projected remote dentry, dirty it and the target
 * inode at the versions reserved earlier, propagate the link to
 * replicas, update balancer stats, and reply success. */
void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
				version_t dnpv, version_t tipv)
{
  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;

  // link and unlock the NEW dentry
  CDentry::linkage_t *dnl = dn->pop_projected_linkage();
  if (!dnl->get_inode())
    dn->link_remote(dnl, targeti);
  dn->mark_dirty(dnpv, mdr->ls);

  // target inode
  targeti->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();

  MDRequestRef null_ref;
  mdcache->send_dentry_link(dn, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);
}
5324
5325
5326// link / unlink remote
5327
5328class C_MDS_link_remote_finish : public ServerLogContext {
5329 bool inc;
5330 CDentry *dn;
5331 CInode *targeti;
5332 version_t dpv;
5333public:
5334 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5335 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5336 dpv(d->get_projected_version()) {}
5337 void finish(int r) override {
5338 assert(r == 0);
5339 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5340 }
5341};
5342
/* Link or unlink a dentry whose target inode is auth on another mds.
 * Phase 1: send a LINKPREP/UNLINKPREP slave request to the target's
 * auth and return, waiting for the ack (handle_slave_link_prep_ack
 * re-dispatches us).  Phase 2 (witnessed): journal our side of the
 * update — the new remote dentry for link, or a null dentry for
 * unlink — and reply from C_MDS_link_remote_finish. */
void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_remote "
	   << (inc ? "link ":"unlink ")
	   << *dn << " to " << *targeti << dendl;

  // 1. send LinkPrepare to dest (journal nlink++ prepare)
  mds_rank_t linkauth = targeti->authority().first;
  if (mdr->more()->witnessed.count(linkauth) == 0) {
    // can't prepare against a peer that isn't up and usable
    if (mds->is_cluster_degraded() &&
	!mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
      dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
      if (mdr->more()->waiting_on_slave.empty())
	mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }

    dout(10) << " targeti auth must prepare nlink++/--" << dendl;
    int op;
    if (inc)
      op = MMDSSlaveRequest::OP_LINKPREP;
    else
      op = MMDSSlaveRequest::OP_UNLINKPREP;
    MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
    targeti->set_object_info(req->get_object_info());
    req->op_stamp = mdr->get_op_stamp();
    mds->send_message_mds(req, linkauth);

    assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
    mdr->more()->waiting_on_slave.insert(linkauth);
    return;
  }
  dout(10) << " targeti auth has prepared nlink++/--" << dendl;

  assert(g_conf->mds_kill_link_at != 2);

  mdr->set_mds_stamp(ceph_clock_now());

  // add to event
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    // record the slave set so resolve can finish this update on failure
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_slaves = true;
    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
  }

  if (inc) {
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
    le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
    dn->push_projected_linkage(targeti->ino(), targeti->d_type());
  } else {
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
    le->metablob.add_null_dentry(dn, true);
    dn->push_projected_linkage();
  }

  journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
}
5408
/* Commit side of _link_remote, run after the EUpdate is durable:
 * link (or unlink) the dentry in memory, mark the master update
 * logged so slaves can be committed, update stats, and reply. */
void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
				 CDentry *dn, CInode *targeti,
				 version_t dpv)
{
  dout(10) << "_link_remote_finish "
	   << (inc ? "link ":"unlink ")
	   << *dn << " to " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 3);

  // our side is durable; slaves may now be told to commit
  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  if (inc) {
    // link the new dentry
    CDentry::linkage_t *dnl = dn->pop_projected_linkage();
    if (!dnl->get_inode())
      dn->link_remote(dnl, targeti);
    dn->mark_dirty(dpv, mdr->ls);
  } else {
    // unlink main dentry
    dn->get_dir()->unlink_inode(dn);
    dn->pop_projected_linkage();
    dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
  }

  mdr->apply();

  MDRequestRef null_ref;
  if (inc)
    mdcache->send_dentry_link(dn, null_ref);
  else
    mdcache->send_dentry_unlink(dn, NULL, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);
  mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  if (!inc)
    // removing a new dn?
    dn->get_dir()->try_remove_unlinked_dn(dn);
}
5454
5455
5456// remote linking/unlinking
5457
5458class C_MDS_SlaveLinkPrep : public ServerLogContext {
5459 CInode *targeti;
5460public:
5461 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5462 ServerLogContext(s, r), targeti(t) { }
5463 void finish(int r) override {
5464 assert(r == 0);
5465 server->_logged_slave_link(mdr, targeti);
5466 }
5467};
5468
5469class C_MDS_SlaveLinkCommit : public ServerContext {
5470 MDRequestRef mdr;
5471 CInode *targeti;
5472public:
5473 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5474 ServerContext(s), mdr(r), targeti(t) { }
5475 void finish(int r) override {
5476 server->_commit_slave_link(mdr, r, targeti);
5477 }
5478};
5479
5480/* This function DOES put the mdr->slave_request before returning*/
/* Slave side of a remote link/unlink: project nlink++ (or --) on the
 * local target inode, journal an ESlaveUpdate PREPARE entry carrying
 * both the commit metablob and an encoded link_rollback, install the
 * commit waiter, and ack the master from C_MDS_SlaveLinkPrep.
 * This function DOES put the mdr->slave_request before returning. */
void Server::handle_slave_link_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_link_prep " << *mdr
	   << " on " << mdr->slave_request->get_object_info()
	   << dendl;

  assert(g_conf->mds_kill_link_at != 4);

  CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
  assert(targeti);
  dout(10) << "targeti " << *targeti << dendl;
  CDentry *dn = targeti->get_parent_dn();
  CDentry::linkage_t *dnl = dn->get_linkage();
  assert(dnl->is_primary());

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  mdr->auth_pin(targeti);

  //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
  assert(g_conf->mds_kill_link_at != 5);

  // journal it
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
				      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
  mdlog->start_entry(le);

  inode_t *pi = dnl->get_inode()->project_inode();

  // update journaled target inode
  bool inc;
  if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
    inc = true;
    pi->nlink++;
  } else {
    inc = false;
    pi->nlink--;
  }

  // capture the pre-change state so do_link_rollback can undo this
  link_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.ino = targeti->ino();
  rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
  const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
  rollback.old_dir_mtime = pf->fragstat.mtime;
  rollback.old_dir_rctime = pf->rstat.rctime;
  rollback.was_inc = inc;
  ::encode(rollback, le->rollback);
  mdr->more()->rollback_bl = le->rollback;

  pi->ctime = mdr->get_op_stamp();
  pi->version = targeti->pre_dirty();

  dout(10) << " projected inode " << pi << " v " << pi->version << dendl;

  // commit case
  mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
                     mdr, __func__);
  mdlog->flush();
}
5549
/* Run after the slave's PREPARE entry is durable: apply the projected
 * nlink change and ack the master with LINKPREPACK (unless the request
 * was aborted in the meantime, in which case just finish it). */
void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
{
  dout(10) << "_logged_slave_link " << *mdr
	   << " " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 6);

  // update the target
  targeti->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  // hit pop
  mds->balancer->hit_inode(mdr->get_mds_stamp(), targeti, META_POP_IWR);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;

  // ack
  if (!mdr->aborted) {
    MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
						   MMDSSlaveRequest::OP_LINKPREPACK);
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
5578
5579
// Continuation run once a slave's COMMIT entry is durable: notifies
// the master via Server::_committed_slave and finishes the request.
struct C_MDS_CommittedSlave : public ServerLogContext {
  C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
  void finish(int r) override {
    server->_committed_slave(mdr);
  }
};
5586
/* Commit waiter for a prepared slave link: on r==0 journal an
 * ESlaveUpdate COMMIT entry (then _committed_slave notifies the
 * master); on failure roll the prepared change back using the
 * rollback blob saved at prepare time. */
void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
{
  dout(10) << "_commit_slave_link " << *mdr
	   << " r=" << r
	   << " " << *targeti << dendl;

  assert(g_conf->mds_kill_link_at != 7);

  if (r == 0) {
    // drop our pins, etc.
    mdr->cleanup();

    // write a commit to the journal
    ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
					ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
    mdlog->start_entry(le);
    submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
    mdlog->flush();
  } else {
    do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}
5609
/* Run after the slave COMMIT entry is durable: tell the master we are
 * committed (OP_COMMITTED) and finish the slave request locally. */
void Server::_committed_slave(MDRequestRef& mdr)
{
  dout(10) << "_committed_slave " << *mdr << dendl;

  assert(g_conf->mds_kill_link_at != 8);

  MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
					       MMDSSlaveRequest::OP_COMMITTED);
  mds->send_message_mds(req, mdr->slave_to_mds);
  mdcache->request_finish(mdr);
}
5621
// Continuation run once the link ROLLBACK entry is durable: applies
// the rollback mutation via Server::_link_rollback_finish.
struct C_MDS_LoggedLinkRollback : public ServerLogContext {
  MutationRef mut;   // the rollback mutation built in do_link_rollback
  C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
  void finish(int r) override {
    server->_link_rollback_finish(mut, mdr);
  }
};
5629
/* Undo a prepared slave link/unlink using the rollback blob encoded at
 * prepare time: restore the target inode's nlink and ctime and the
 * parent dir's mtime/rctime, then journal an ESlaveUpdate ROLLBACK
 * entry.  Called either with a live mdr (master told us to abort) or
 * without one during resolve after a failure. */
void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
  link_rollback rollback;
  bufferlist::iterator p = rbl.begin();
  ::decode(rollback, p);

  dout(10) << "do_link_rollback on " << rollback.reqid
	   << (rollback.was_inc ? " inc":" dec")
	   << " ino " << rollback.ino
	   << dendl;

  assert(g_conf->mds_kill_link_at != 9);

  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
  assert(mdr || mds->is_resolve());

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CInode *in = mdcache->get_inode(rollback.ino);
  assert(in);
  dout(10) << " target is " << *in << dendl;
  assert(!in->is_projected()); // live slave request hold versionlock xlock.

  inode_t *pi = in->project_inode();
  pi->version = in->pre_dirty();
  mut->add_projected_inode(in);

  // parent dir rctime
  CDir *parent = in->get_projected_parent_dn()->get_dir();
  fnode_t *pf = parent->project_fnode();
  mut->add_projected_fnode(parent);
  pf->version = parent->pre_dirty();
  // only restore dir times if our prepare was the last thing to touch them
  if (pf->fragstat.mtime == pi->ctime) {
    pf->fragstat.mtime = rollback.old_dir_mtime;
    if (pf->rstat.rctime == pi->ctime)
      pf->rstat.rctime = rollback.old_dir_rctime;
    mut->add_updated_lock(&parent->get_inode()->filelock);
    mut->add_updated_lock(&parent->get_inode()->nestlock);
  }

  // inode: restore ctime and reverse the nlink change
  pi->ctime = rollback.old_ctime;
  if (rollback.was_inc)
    pi->nlink--;
  else
    pi->nlink++;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
  mdlog->start_entry(le);
  le->commit.add_dir_context(parent);
  le->commit.add_dir(parent, true);
  le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);

  submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
                     mdr, __func__);
  mdlog->flush();
}
5690
/* Run after the ROLLBACK entry is durable: apply the rollback
 * mutation, finish the slave request (if there is one), and tell the
 * cache the rollback is complete so resolve can proceed. */
void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
{
  dout(10) << "_link_rollback_finish" << dendl;

  assert(g_conf->mds_kill_link_at != 10);

  mut->apply();
  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(mut->reqid);

  mut->cleanup();
}
5705
5706
5707/* This function DOES NOT put the passed message before returning*/
/* Master side: handle the LINKPREPACK from the target's auth mds.
 * Records the slave as a witness of the journaled prepare, clears the
 * wait entry, and re-dispatches the client request so _link_remote can
 * proceed to phase 2.
 * This function DOES NOT put the passed message before returning. */
void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
{
  dout(10) << "handle_slave_link_prep_ack " << *mdr
	   << " " << *m << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  assert(g_conf->mds_kill_link_at != 11);

  // note slave
  mdr->more()->slaves.insert(from);

  // witnessed!
  assert(mdr->more()->witnessed.count(from) == 0);
  mdr->more()->witnessed.insert(from);
  assert(!m->is_not_journaled());
  mdr->more()->has_journaled_slaves = true;

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  // link only ever has a single witness, so we must be done waiting
  assert(mdr->more()->waiting_on_slave.empty());

  dispatch_client_request(mdr); // go again!
}
5733
5734
5735
5736
5737
5738// UNLINK
5739
// Handle CEPH_MDS_OP_UNLINK / CEPH_MDS_OP_RMDIR.  Traverses to the target
// dentry, validates dir/non-dir against the requested op, prepares a stray
// dentry if this is the last primary link, acquires the needed locks,
// gathers remote witnesses when removing a subtree root, and finally
// performs the unlink locally (or via the remote inode's auth).
// May respond with -EINVAL, -EROFS, -EISDIR, -ENOTDIR or -ENOTEMPTY.
 5740void Server::handle_client_unlink(MDRequestRef& mdr)
 5741{
 5742 MClientRequest *req = mdr->client_request;
 5743 client_t client = mdr->get_client();
 5744
 5745 // rmdir or unlink?
 5746 bool rmdir = false;
 5747 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
 5748
 5749 if (req->get_filepath().depth() == 0) {
 5750 respond_to_request(mdr, -EINVAL);
 5751 return;
 5752 }
 5753
 5754 // traverse to path
 5755 vector<CDentry*> trace;
 5756 CInode *in;
 5757 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(), &trace, &in, MDS_TRAVERSE_FORWARD);
 5758 if (r > 0) return;
 5759 if (r < 0) {
 5760 if (r == -ESTALE) {
 5761 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
 5762 mdcache->find_ino_peers(req->get_filepath().get_ino(), new C_MDS_TryFindInode(this, mdr));
 5763 return;
 5764 }
 5765 respond_to_request(mdr, r);
 5766 return;
 5767 }
 // snapshots are read-only
 5768 if (mdr->snapid != CEPH_NOSNAP) {
 5769 respond_to_request(mdr, -EROFS);
 5770 return;
 5771 }
 5772
 5773 CDentry *dn = trace[trace.size()-1];
 5774 assert(dn);
 // only the dentry's auth may perform the unlink; forward otherwise
 5775 if (!dn->is_auth()) {
 5776 mdcache->request_forward(mdr, dn->authority().first);
 5777 return;
 5778 }
 5779
 5780 CInode *diri = dn->get_dir()->get_inode();
 5781
 5782 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
 5783 assert(!dnl->is_null());
 5784
 5785 if (rmdir) {
 5786 dout(7) << "handle_client_rmdir on " << *dn << dendl;
 5787 } else {
 5788 dout(7) << "handle_client_unlink on " << *dn << dendl;
 5789 }
 5790 dout(7) << "dn links to " << *in << dendl;
 5791
 5792 // rmdir vs is_dir
 5793 if (in->is_dir()) {
 5794 if (rmdir) {
 5795 // do empty directory checks (fast, unlocked; rechecked under locks below)
 5796 if (_dir_is_nonempty_unlocked(mdr, in)) {
 5797 respond_to_request(mdr, -ENOTEMPTY);
 5798 return;
 5799 }
 5800 } else {
 5801 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
 5802 respond_to_request(mdr, -EISDIR);
 5803 return;
 5804 }
 5805 } else {
 5806 if (rmdir) {
 5807 // unlink
 5808 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
 5809 respond_to_request(mdr, -ENOTDIR);
 5810 return;
 5811 }
 5812 }
 5813
 5814 // -- create stray dentry? --
 // last primary link: the inode must be relinked into a stray dir
 5815 CDentry *straydn = NULL;
 5816 if (dnl->is_primary()) {
 5817 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
 5818 if (!straydn)
 5819 return;
 5820 dout(10) << " straydn is " << *straydn << dendl;
 5821 } else if (mdr->straydn) {
 5822 mdr->unpin(mdr->straydn);
 5823 mdr->straydn = NULL;
 5824 }
 5825
 5826 // lock
 5827 set<SimpleLock*> rdlocks, wrlocks, xlocks;
 5828
 5829 for (int i=0; i<(int)trace.size()-1; i++)
 5830 rdlocks.insert(&trace[i]->lock);
 5831 xlocks.insert(&dn->lock);
 5832 wrlocks.insert(&diri->filelock);
 5833 wrlocks.insert(&diri->nestlock);
 5834 xlocks.insert(&in->linklock);
 5835 if (straydn) {
 5836 wrlocks.insert(&straydn->get_dir()->inode->filelock);
 5837 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
 5838 xlocks.insert(&straydn->lock);
 5839 }
 5840 if (in->is_dir())
 5841 rdlocks.insert(&in->filelock); // to verify it's empty
 5842 mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
 5843
 5844 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
 5845 return;
 5846
 // re-check emptiness now that we hold the filelock
 5847 if (in->is_dir() &&
 5848 _dir_is_nonempty(mdr, in)) {
 5849 respond_to_request(mdr, -ENOTEMPTY);
 5850 return;
 5851 }
 5852
 // only check access on the first pass (before any witness has journaled)
 5853 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
 5854 if (!check_access(mdr, diri, MAY_WRITE))
 5855 return;
 5856 }
 5857
 5858 // yay!
 5859 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
 5860 // subtree root auths need to be witnesses
 5861 set<mds_rank_t> witnesses;
 5862 in->list_replicas(witnesses);
 5863 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
 5864
 5865 for (set<mds_rank_t>::iterator p = witnesses.begin();
 5866 p != witnesses.end();
 5867 ++p) {
 5868 if (mdr->more()->witnessed.count(*p)) {
 5869 dout(10) << " already witnessed by mds." << *p << dendl;
 5870 } else if (mdr->more()->waiting_on_slave.count(*p)) {
 5871 dout(10) << " already waiting on witness mds." << *p << dendl;
 5872 } else {
 5873 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
 5874 return;
 5875 }
 5876 }
 5877 if (!mdr->more()->waiting_on_slave.empty())
 5878 return; // we're waiting for a witness.
 5879 }
 5880
 5881 // ok!
 // a remote link whose inode is auth elsewhere: let the auth do the nlink--
 5882 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
 5883 _link_remote(mdr, false, dn, dnl->get_inode());
 5884 else
 5885 _unlink_local(mdr, dn, straydn);
 5886}
5887
// Journal-completion callback for _unlink_local: applies the unlink once
// the EUpdate is safely logged.  dnpv snapshots the dentry's projected
// version at construction time so the finisher can mark the dentry dirty
// at the correct version.
 5888class C_MDS_unlink_local_finish : public ServerLogContext {
 5889 CDentry *dn;
 5890 CDentry *straydn;
 5891 version_t dnpv; // deleted dentry
 5892public:
 5893 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
 5894 ServerLogContext(s, r), dn(d), straydn(sd),
 5895 dnpv(d->get_projected_version()) {}
 5896 void finish(int r) override {
 5897 assert(r == 0);
 5898 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
 5899 }
 5900};
5901
// Journal a local unlink/rmdir of dentry dn.  If dn is the primary link,
// the inode is relinked under straydn (in a stray directory) so purge can
// reclaim it after commit; a remote link just journals the remote inode
// update.  The cache mutation itself happens in _unlink_local_finish once
// the EUpdate is logged.
// @param straydn  stray dentry prepared by the caller; required iff dn is primary
 5902void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
 5903{
 5904 dout(10) << "_unlink_local " << *dn << dendl;
 5905
 5906 CDentry::linkage_t *dnl = dn->get_projected_linkage();
 5907 CInode *in = dnl->get_inode();
 5908
 5909 SnapRealm *realm = in->find_snaprealm();
 5910 snapid_t follows = realm->get_newest_seq();
 5911
 5912 // ok, let's do it.
 5913 mdr->ls = mdlog->get_current_segment();
 5914
 5915 // prepare log entry
 5916 EUpdate *le = new EUpdate(mdlog, "unlink_local");
 5917 mdlog->start_entry(le);
 5918 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
 // if remote witnesses journaled a prepare, track this as an uncommitted
 // master update so resolve can sort it out on failure
 5919 if (!mdr->more()->witnessed.empty()) {
 5920 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
 5921 le->reqid = mdr->reqid;
 5922 le->had_slaves = true;
 5923 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
 5924 }
 5925
 5926 if (straydn) {
 5927 assert(dnl->is_primary());
 5928 straydn->push_projected_linkage(in);
 5929 straydn->first = follows + 1;
 5930 }
 5931
 5932 // the unlinked dentry
 5933 dn->pre_dirty();
 5934
 5935 inode_t *pi = in->project_inode();
 // remember where the inode was unlinked from, for later reintegration
31f18b77 5936 dn->make_path_string(pi->stray_prior_path, true);
7c673cae
FG
 5937 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
 5938 pi->version = in->pre_dirty();
 5939 pi->ctime = mdr->get_op_stamp();
 5940 pi->change_attr++;
 5941 pi->nlink--;
 5942 if (pi->nlink == 0)
 5943 in->state_set(CInode::STATE_ORPHAN);
 5944
 5945 if (dnl->is_primary()) {
 5946 // primary link. add stray dentry.
 5947 assert(straydn);
 5948 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
 5949 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
 5950
 5951 // project snaprealm, too
 5952 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
 5953 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
 5954
 5955 pi->update_backtrace();
 5956 le->metablob.add_primary_dentry(straydn, in, true, true);
 5957 } else {
 5958 // remote link. update remote inode.
 5959 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
 5960 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
 5961 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
 5962 }
 5963
 5964 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
 5965 le->metablob.add_null_dentry(dn, true);
 5966
 // unlinking a dir moves its subtree under the stray dir; the journal
 // replay needs to know about the rename
 5967 if (in->is_dir()) {
 5968 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
 5969 le->metablob.renamed_dirino = in->ino();
 5970 }
 5971
 5972 dn->push_projected_linkage();
 5973
 5974 if (in->is_dir()) {
 5975 assert(straydn);
 5976 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
 5977 }
 5978
 5979 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
 5980}
5981
// Apply a journaled local unlink: drop the dentry, relink the inode as a
// stray (if it was a primary link), notify replicas/clients, and reply.
// @param dnpv  projected version the unlinked dentry should be dirtied at
 5982void Server::_unlink_local_finish(MDRequestRef& mdr,
 5983 CDentry *dn, CDentry *straydn,
 5984 version_t dnpv)
 5985{
 5986 dout(10) << "_unlink_local_finish " << *dn << dendl;
 5987
 5988 if (!mdr->more()->witnessed.empty())
 5989 mdcache->logged_master_update(mdr->reqid);
 5990
 5991 // unlink main dentry
 5992 dn->get_dir()->unlink_inode(dn);
 5993 dn->pop_projected_linkage();
 5994
 5995 // relink as stray? (i.e. was primary link?)
 5996 CInode *strayin = NULL;
 5997 bool snap_is_new = false;
 5998 if (straydn) {
 5999 dout(20) << " straydn is " << *straydn << dendl;
 6000 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
 6001 strayin = straydnl->get_inode();
 6002
 6003 snap_is_new = strayin->snaprealm ? true : false;
 6004 mdcache->touch_dentry_bottom(straydn);
 6005 }
 6006
 6007 dn->mark_dirty(dnpv, mdr->ls);
 6008 mdr->apply();
 6009
 6010 if (snap_is_new) //only new if strayin exists
 6011 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
 6012
 6013 mdcache->send_dentry_unlink(dn, straydn, mdr);
 6014
 6015 // update subtree map?
 6016 if (straydn && strayin->is_dir())
 6017 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
 6018
 6019 // bump pop
 6020 mds->balancer->hit_dir(mdr->get_mds_stamp(), dn->get_dir(), META_POP_IWR);
 6021
 6022 // reply
 6023 respond_to_request(mdr, 0);
 6024
 6025 // removing a new dn?
 6026 dn->get_dir()->try_remove_unlinked_dn(dn);
 6027
 6028 // clean up ?
 6029 // respond_to_request() drops locks. So stray reintegration can race with us.
 6030 if (straydn && !straydn->get_projected_linkage()->is_null()) {
 6031 // Tip off the MDCache that this dentry is a stray that
 6032 // might be eligible for purge.
 6033 mdcache->notify_stray(straydn);
 6034 }
 6035}
6036
// Ask mds.who to witness (and journal a prepare for) this rmdir before we
// commit it.  Returns false when the peer is not yet active (the request
// will be retried once it is); otherwise sends OP_RMDIRPREP, records
// 'who' in waiting_on_slave, and returns true.
 6037bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
 6038{
 6039 if (mds->is_cluster_degraded() &&
 6040 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
 6041 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
 6042 if (mdr->more()->waiting_on_slave.empty())
 6043 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
 6044 return false;
 6045 }
 6046
 6047 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
 6048 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
 6049 MMDSSlaveRequest::OP_RMDIRPREP);
 // rebuild the target path so the slave can traverse to the same dentry
 6050 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
 6051 for (auto dn : trace)
 6052 req->srcdnpath.push_dentry(dn->name);
 // ship a replica of the stray dentry the slave will relink under
 6053 mdcache->replicate_stray(straydn, who, req->stray);
 6054
 6055 req->op_stamp = mdr->get_op_stamp();
 6056 mds->send_message_mds(req, who);
 6057
 6058 assert(mdr->more()->waiting_on_slave.count(who) == 0);
 6059 mdr->more()->waiting_on_slave.insert(who);
 6060 return true;
 6061}
6062
// Slave-side journal callback: once the ESlaveUpdate PREPARE for an rmdir
// is logged, apply the change in cache and ack the master.
 6063struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
 6064 CDentry *dn, *straydn;
 6065 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
 6066 : ServerLogContext(s, r), dn(d), straydn(st) {}
 6067 void finish(int r) override {
 6068 server->_logged_slave_rmdir(mdr, dn, straydn);
 6069 }
 6070};
6071
// Commit/abort waiter stored in mdr->more()->slave_commit: invoked with
// r == 0 to commit the prepared slave rmdir, nonzero to roll it back.
 6072struct C_MDS_SlaveRmdirCommit : public ServerContext {
 6073 MDRequestRef mdr;
31f18b77
FG
 6074 CDentry *straydn;
 6075 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
 6076 : ServerContext(s), mdr(r), straydn(sd) { }
7c673cae 6077 void finish(int r) override {
31f18b77 6078 server->_commit_slave_rmdir(mdr, r, straydn);
7c673cae
FG
 6079 }
 6080};
6081
// Slave-side handler for OP_RMDIRPREP from the rmdir master.  Locates the
// target dentry and the stray dentry replicated by the master, records a
// rollback blob, then either (a) relinks in-cache and acks immediately
// (marked not-journaled) when we hold no auth subtree under the dir, or
// (b) journals an ESlaveUpdate PREPARE and acks from the log callback.
 6082void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
 6083{
 6084 dout(10) << "handle_slave_rmdir_prep " << *mdr
 6085 << " " << mdr->slave_request->srcdnpath
 6086 << " to " << mdr->slave_request->destdnpath
 6087 << dendl;
 6088
 6089 vector<CDentry*> trace;
 6090 filepath srcpath(mdr->slave_request->srcdnpath);
 6091 dout(10) << " src " << srcpath << dendl;
 6092 CInode *in;
 6093 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
 6094 if (r > 0) return;
 6095 if (r == -ESTALE) {
 6096 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
 6097 mdr->slave_to_mds);
 6098 return;
 6099 }
 6100 assert(r == 0);
 6101 CDentry *dn = trace[trace.size()-1];
 6102 dout(10) << " dn " << *dn << dendl;
 6103 mdr->pin(dn);
 6104
 // stray dentry was decoded from the master's message earlier
 6105 assert(mdr->straydn);
 6106 CDentry *straydn = mdr->straydn;
 6107 dout(10) << " straydn " << *straydn << dendl;
 6108
 6109 mdr->set_op_stamp(mdr->slave_request->op_stamp);
 6110
 // record enough state to undo the relink if the master aborts
 6111 rmdir_rollback rollback;
 6112 rollback.reqid = mdr->reqid;
 6113 rollback.src_dir = dn->get_dir()->dirfrag();
 6114 rollback.src_dname = dn->name;
 6115 rollback.dest_dir = straydn->get_dir()->dirfrag();
 6116 rollback.dest_dname = straydn->name;
 6117 ::encode(rollback, mdr->more()->rollback_bl);
 6118 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
 6119
 6120 // set up commit waiter
31f18b77 6121 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
7c673cae
FG
 6122
 6123 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
 6124 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
 6125 dn->get_dir()->unlink_inode(dn);
 6126 straydn->get_dir()->link_primary_inode(straydn, in);
 6127
 6128 assert(straydn->first >= in->first);
 6129 in->first = straydn->first;
 6130
 6131 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
 6132
 6133 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
 6134 MMDSSlaveRequest::OP_RMDIRPREPACK);
 6135 reply->mark_not_journaled();
 6136 mds->send_message_mds(reply, mdr->slave_to_mds);
 6137
 6138 // send caps to auth (if we're not already)
 6139 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
 6140 mdcache->migrator->export_caps(in);
 6141
 6142 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
 6143
 6144 mdr->slave_request->put();
 6145 mdr->slave_request = 0;
 6146 mdr->straydn = 0;
 6147 return;
 6148 }
 6149
 6150 straydn->push_projected_linkage(in);
 6151 dn->push_projected_linkage();
 6152
 6153 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
 6154 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
 6155 mdlog->start_entry(le);
 6156 le->rollback = mdr->more()->rollback_bl;
 6157
 6158 le->commit.add_dir_context(straydn->get_dir());
 6159 le->commit.add_primary_dentry(straydn, in, true);
 6160 // slave: no need to journal original dentry
 6161
 6162 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
 6163 le->commit.renamed_dirino = in->ino();
 6164
 6165 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
 6166
 6167 mdr->more()->slave_update_journaled = true;
 6168 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
 6169 mdr, __func__);
 6170 mdlog->flush();
 6171}
6172
// Called once the slave's PREPARE entry is safely in the journal: apply
// the rename-to-stray in the cache and ack the master — unless the
// request was aborted in the meantime, in which case just finish.
 6173void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
 6174{
 6175 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
 6176
 6177 // update our cache now, so we are consistent with what is in the journal
 6178 // when we journal a subtree map
 6179 CInode *in = dn->get_linkage()->get_inode();
 6180 dn->get_dir()->unlink_inode(dn);
 6181 straydn->pop_projected_linkage();
 6182 dn->pop_projected_linkage();
 6183 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
 6184
 6185 // done.
 6186 mdr->slave_request->put();
 6187 mdr->slave_request = 0;
 6188 mdr->straydn = 0;
 6189
 6190 if (!mdr->aborted) {
 6191 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
 6192 MMDSSlaveRequest::OP_RMDIRPREPACK);
 6193 mds->send_message_mds(reply, mdr->slave_to_mds);
 6194 } else {
 6195 dout(10) << " abort flag set, finishing" << dendl;
 6196 mdcache->request_finish(mdr);
 6197 }
 6198}
6199
// Master-side handler for a slave's OP_RMDIRPREPACK: record the witness
// (and whether it journaled a prepare), and re-dispatch the request once
// all outstanding witnesses have answered.
 6200void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
 6201{
 6202 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
 6203 << " " << *ack << dendl;
 6204
 6205 mds_rank_t from = mds_rank_t(ack->get_source().num());
 6206
 6207 mdr->more()->slaves.insert(from);
 6208 mdr->more()->witnessed.insert(from);
 6209 if (!ack->is_not_journaled())
 6210 mdr->more()->has_journaled_slaves = true;
 6211
 6212 // remove from waiting list
 6213 assert(mdr->more()->waiting_on_slave.count(from));
 6214 mdr->more()->waiting_on_slave.erase(from);
 6215
 6216 if (mdr->more()->waiting_on_slave.empty())
 6217 dispatch_client_request(mdr); // go again!
 6218 else
 6219 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
 6220}
6221
// Resolve a prepared slave rmdir: commit it when r == 0 (clearing the
// stray's dirty bits and journaling an ESlaveUpdate COMMIT if a prepare
// was journaled), or roll it back via the saved rollback blob otherwise.
31f18b77 6222void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7c673cae
FG
 6223{
 6224 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
 6225
 6226 if (r == 0) {
31f18b77
FG
 6227 if (mdr->more()->slave_update_journaled) {
 6228 CInode *strayin = straydn->get_projected_linkage()->get_inode();
 6229 if (strayin && !strayin->snaprealm)
 6230 mdcache->clear_dirty_bits_for_stray(strayin);
 6231 }
 6232
7c673cae
FG
 6233 mdr->cleanup();
 6234
 6235 if (mdr->more()->slave_update_journaled) {
 6236 // write a commit to the journal
 6237 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
 6238 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
 6239 ESlaveUpdate::RMDIR);
 6240 mdlog->start_entry(le);
 6241 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
 6242 mdlog->flush();
 6243 } else {
 6244 _committed_slave(mdr);
 6245 }
 6246 } else {
 6247 // abort
 6248 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
 6249 }
 6250}
6251
// Journal-completion callback for do_rmdir_rollback: finalize the rollback
// once the ESlaveUpdate ROLLBACK entry has been logged.
 6252struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
 6253 metareqid_t reqid;
 6254 CDentry *dn;
 6255 CDentry *straydn;
 6256 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
 6257 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
 6258 void finish(int r) override {
 6259 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
 6260 }
 6261};
6262
6263void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6264{
6265 // unlink the other rollback methods, the rmdir rollback is only
6266 // needed to record the subtree changes in the journal for inode
6267 // replicas who are auth for empty dirfrags. no actual changes to
6268 // the file system are taking place here, so there is no Mutation.
6269
6270 rmdir_rollback rollback;
6271 bufferlist::iterator p = rbl.begin();
6272 ::decode(rollback, p);
6273
6274 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6275 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6276 assert(mdr || mds->is_resolve());
6277
6278 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6279 if (!dir)
6280 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6281 assert(dir);
6282 CDentry *dn = dir->lookup(rollback.src_dname);
6283 assert(dn);
6284 dout(10) << " dn " << *dn << dendl;
6285 dir = mdcache->get_dirfrag(rollback.dest_dir);
6286 assert(dir);
6287 CDentry *straydn = dir->lookup(rollback.dest_dname);
6288 assert(straydn);
6289 dout(10) << " straydn " << *dn << dendl;
6290 CInode *in = straydn->get_linkage()->get_inode();
6291
6292 if (mdr && !mdr->more()->slave_update_journaled) {
6293 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6294
6295 straydn->get_dir()->unlink_inode(straydn);
6296 dn->get_dir()->link_primary_inode(dn, in);
6297
6298 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6299
6300 mdcache->request_finish(mdr);
6301 mdcache->finish_rollback(rollback.reqid);
6302 return;
6303 }
6304
6305 dn->push_projected_linkage(in);
6306 straydn->push_projected_linkage();
6307
6308 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6309 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6310 mdlog->start_entry(le);
6311
6312 le->commit.add_dir_context(dn->get_dir());
6313 le->commit.add_primary_dentry(dn, in, true);
6314 // slave: no need to journal straydn
6315
6316 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6317 le->commit.renamed_dirino = in->ino();
6318
6319 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6320
6321 submit_mdlog_entry(le,
6322 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6323 dn, straydn),
6324 mdr, __func__);
6325 mdlog->flush();
6326}
6327
// Apply a journaled rmdir rollback: move the inode back from the stray
// dentry to its original dentry, fix up the subtree map (trimming
// non-auth subtrees during resolve), and release the rollback state.
 6328void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
 6329{
 6330 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
 6331
 6332 straydn->get_dir()->unlink_inode(straydn);
 6333 dn->pop_projected_linkage();
 6334 straydn->pop_projected_linkage();
 6335
 6336 CInode *in = dn->get_linkage()->get_inode();
 6337 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
 6338 if (mds->is_resolve()) {
 6339 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
 6340 mdcache->try_trim_non_auth_subtree(root);
 6341 }
 6342
 6343 if (mdr)
 6344 mdcache->request_finish(mdr);
 6345
 6346 mdcache->finish_rollback(reqid);
 6347}
6348
6349
6350/** _dir_is_nonempty[_unlocked]
6351 *
6352 * check if a directory is non-empty (i.e. we can rmdir it).
6353 *
 6354 * the unlocked variant is a fastpath check; we can't really be
6355 * sure until we rdlock the filelock.
6356 */
6357bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6358{
6359 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6360 assert(in->is_auth());
6361
6362 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6363 return true; // in a snapshot!
6364
6365 list<CDir*> ls;
6366 in->get_dirfrags(ls);
6367 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6368 CDir *dir = *p;
6369 // is the frag obviously non-empty?
6370 if (dir->is_auth()) {
6371 if (dir->get_projected_fnode()->fragstat.size()) {
6372 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6373 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6374 return true;
6375 }
6376 }
6377 }
6378
6379 return false;
6380}
6381
6382bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6383{
6384 dout(10) << "dir_is_nonempty " << *in << dendl;
6385 assert(in->is_auth());
6386 assert(in->filelock.can_read(mdr->get_client()));
6387
6388 frag_info_t dirstat;
6389 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6390
6391 list<CDir*> ls;
6392 in->get_dirfrags(ls);
6393 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6394 CDir *dir = *p;
6395 const fnode_t *pf = dir->get_projected_fnode();
6396 if (pf->fragstat.size()) {
6397 dout(10) << "dir_is_nonempty dirstat has "
6398 << pf->fragstat.size() << " items " << *dir << dendl;
6399 return true;
6400 }
6401
6402 if (pf->accounted_fragstat.version == dirstat_version)
6403 dirstat.add(pf->accounted_fragstat);
6404 else
6405 dirstat.add(pf->fragstat);
6406 }
6407
6408 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6409}
6410
6411
6412// ======================================================
6413
6414
// Journal-completion callback for rename: applies the rename once the
// log entry is safely committed.
 6415class C_MDS_rename_finish : public ServerLogContext {
 6416 CDentry *srcdn;
 6417 CDentry *destdn;
 6418 CDentry *straydn;
 6419public:
 6420 C_MDS_rename_finish(Server *s, MDRequestRef& r,
 6421 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
 6422 ServerLogContext(s, r),
 6423 srcdn(sdn), destdn(ddn), straydn(stdn) { }
 6424 void finish(int r) override {
 6425 assert(r == 0);
 6426 server->_rename_finish(mdr, srcdn, destdn, straydn);
 6427 }
 6428};
6429
6430
6431/** handle_client_rename
6432 *
6433 * rename master is the destdn auth. this is because cached inodes
6434 * must remain connected. thus, any replica of srci, must also
6435 * replicate destdn, and possibly straydn, so that srci (and
6436 * destdn->inode) remain connected during the rename.
6437 *
6438 * to do this, we freeze srci, then master (destdn auth) verifies that
 6439 * all other nodes have also replicated destdn and straydn. note that
6440 * destdn replicas need not also replicate srci. this only works when
6441 * destdn is master.
6442 *
6443 * This function takes responsibility for the passed mdr.
6444 */
6445void Server::handle_client_rename(MDRequestRef& mdr)
6446{
6447 MClientRequest *req = mdr->client_request;
6448 dout(7) << "handle_client_rename " << *req << dendl;
6449
6450 filepath destpath = req->get_filepath();
6451 filepath srcpath = req->get_filepath2();
6452 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6453 respond_to_request(mdr, -EINVAL);
6454 return;
6455 }
6456 const string &destname = destpath.last_dentry();
6457
6458 vector<CDentry*>& srctrace = mdr->dn[1];
6459 vector<CDentry*>& desttrace = mdr->dn[0];
6460
6461 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6462
6463 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6464 if (!destdn) return;
6465 dout(10) << " destdn " << *destdn << dendl;
6466 if (mdr->snapid != CEPH_NOSNAP) {
6467 respond_to_request(mdr, -EROFS);
6468 return;
6469 }
6470 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6471 CDir *destdir = destdn->get_dir();
6472 assert(destdir->is_auth());
6473
6474 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6475 if (r > 0)
6476 return; // delayed
6477 if (r < 0) {
6478 if (r == -ESTALE) {
6479 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6480 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6481 } else {
6482 dout(10) << "FAIL on error " << r << dendl;
6483 respond_to_request(mdr, r);
6484 }
6485 return;
6486
6487 }
6488 assert(!srctrace.empty());
6489 CDentry *srcdn = srctrace[srctrace.size()-1];
6490 dout(10) << " srcdn " << *srcdn << dendl;
6491 if (srcdn->last != CEPH_NOSNAP) {
6492 respond_to_request(mdr, -EROFS);
6493 return;
6494 }
6495 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6496 CInode *srci = srcdnl->get_inode();
6497 dout(10) << " srci " << *srci << dendl;
6498
6499 CInode *oldin = 0;
6500 if (!destdnl->is_null()) {
6501 //dout(10) << "dest dn exists " << *destdn << dendl;
6502 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6503 if (!oldin) return;
6504 dout(10) << " oldin " << *oldin << dendl;
6505
6506 // mv /some/thing /to/some/existing_other_thing
6507 if (oldin->is_dir() && !srci->is_dir()) {
6508 respond_to_request(mdr, -EISDIR);
6509 return;
6510 }
6511 if (!oldin->is_dir() && srci->is_dir()) {
6512 respond_to_request(mdr, -ENOTDIR);
6513 return;
6514 }
6515
6516 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6517 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6518 respond_to_request(mdr, -ENOTEMPTY);
6519 return;
6520 }
6521 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6522 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6523 return;
6524 }
6525 }
6526
6527 // -- some sanity checks --
6528
6529 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6530 if (destpath.get_ino() != srcpath.get_ino() &&
6531 !(req->get_source().is_mds() &&
6532 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6533 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6534 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6535 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6536 while (srcbase != destbase &&
6537 !srcbase->is_projected_ancestor_of(destbase)) {
6538 CDentry *pdn = srcbase->get_projected_parent_dn();
6539 srctrace.insert(srctrace.begin(), pdn);
6540 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6541 srcbase = pdn->get_dir()->get_inode();
6542 }
6543
6544 // then, extend destpath until it shares the same parent inode as srcpath.
6545 while (destbase != srcbase) {
6546 CDentry *pdn = destbase->get_projected_parent_dn();
6547 desttrace.insert(desttrace.begin(), pdn);
6548 rdlocks.insert(&pdn->lock);
6549 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6550 destbase = pdn->get_dir()->get_inode();
6551 }
6552 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6553 }
6554
6555 // src == dest?
6556 if (srcdn->get_dir() == destdir && srcdn->name == destname) {
6557 dout(7) << "rename src=dest, noop" << dendl;
6558 respond_to_request(mdr, 0);
6559 return;
6560 }
6561
6562 // dest a child of src?
6563 // e.g. mv /usr /usr/foo
6564 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6565 while (pdn) {
6566 if (pdn == srcdn) {
6567 dout(7) << "cannot rename item to be a child of itself" << dendl;
6568 respond_to_request(mdr, -EINVAL);
6569 return;
6570 }
6571 pdn = pdn->get_dir()->inode->parent;
6572 }
6573
6574 // is this a stray migration, reintegration or merge? (sanity checks!)
6575 if (mdr->reqid.name.is_mds() &&
6576 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6577 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6578 !(destdnl->is_remote() &&
6579 destdnl->get_remote_ino() == srci->ino())) {
6580 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
6581 return;
6582 }
6583
6584 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6585 (srcdnl->is_primary() || destdnl->is_primary()));
6586 if (linkmerge)
6587 dout(10) << " this is a link merge" << dendl;
6588
6589 // -- create stray dentry? --
6590 CDentry *straydn = NULL;
6591 if (destdnl->is_primary() && !linkmerge) {
6592 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6593 if (!straydn)
6594 return;
6595 dout(10) << " straydn is " << *straydn << dendl;
6596 } else if (mdr->straydn) {
6597 mdr->unpin(mdr->straydn);
6598 mdr->straydn = NULL;
6599 }
6600
6601 // -- prepare witness list --
6602 /*
6603 * NOTE: we use _all_ replicas as witnesses.
6604 * this probably isn't totally necessary (esp for file renames),
6605 * but if/when we change that, we have to make sure rejoin is
6606 * sufficiently robust to handle strong rejoins from survivors
6607 * with totally wrong dentry->inode linkage.
6608 * (currently, it can ignore rename effects, because the resolve
6609 * stage will sort them out.)
6610 */
6611 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6612 if (srcdn->is_auth())
6613 srcdn->list_replicas(witnesses);
6614 else
6615 witnesses.insert(srcdn->authority().first);
6616 if (srcdnl->is_remote() && !srci->is_auth())
6617 witnesses.insert(srci->authority().first);
6618 destdn->list_replicas(witnesses);
6619 if (destdnl->is_remote() && !oldin->is_auth())
6620 witnesses.insert(oldin->authority().first);
6621 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6622
6623
6624 // -- locks --
6625 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6626
6627 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6628 for (int i=0; i<(int)srctrace.size(); i++)
6629 rdlocks.insert(&srctrace[i]->lock);
6630 xlocks.insert(&srcdn->lock);
6631 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6632 if (srcdirauth != mds->get_nodeid()) {
6633 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6634 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6635 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6636 if (srci->is_dir())
6637 rdlocks.insert(&srci->dirfragtreelock);
6638 } else {
6639 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6640 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6641 }
6642 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6643
6644 // straydn?
6645 if (straydn) {
6646 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6647 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6648 xlocks.insert(&straydn->lock);
6649 }
6650
6651 // xlock versionlock on dentries if there are witnesses.
6652 // replicas can't see projected dentry linkages, and will get
6653 // confused if we try to pipeline things.
6654 if (!witnesses.empty()) {
6655 // take xlock on all projected ancestor dentries for srcdn and destdn.
6656 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6657 for (int i= 0; i<(int)srctrace.size(); i++) {
6658 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6659 xlocks.insert(&srctrace[i]->versionlock);
6660 }
6661 for (int i=0; i<(int)desttrace.size(); i++) {
6662 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6663 xlocks.insert(&desttrace[i]->versionlock);
6664 }
6665 // xlock srci and oldin's primary dentries, so witnesses can call
6666 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6667 // is traversed.
6668 if (srcdnl->is_remote())
6669 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6670 if (destdnl->is_remote())
6671 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6672 }
6673
6674 // we need to update srci's ctime. xlock its least contended lock to do that...
6675 xlocks.insert(&srci->linklock);
6676
6677 // xlock oldin (for nlink--)
6678 if (oldin) {
6679 xlocks.insert(&oldin->linklock);
6680 if (oldin->is_dir())
6681 rdlocks.insert(&oldin->filelock);
6682 }
6683 if (srcdnl->is_primary() && srci->is_dir())
6684 // FIXME: this should happen whenever we are renamning between
6685 // realms, regardless of the file type
6686 // FIXME: If/when this changes, make sure to update the
6687 // "allowance" in handle_slave_rename_prep
6688 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6689 else
6690 rdlocks.insert(&srci->snaplock);
6691
6692 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6693 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6694 &remote_wrlocks, auth_pin_freeze))
6695 return;
6696
6697 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6698 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6699 return;
6700
6701 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6702 return;
6703
6704 if (!check_fragment_space(mdr, destdn->get_dir()))
6705 return;
6706
6707 if (!check_access(mdr, srci, MAY_WRITE))
6708 return;
6709 }
6710
6711 // with read lock, really verify oldin is empty
6712 if (oldin &&
6713 oldin->is_dir() &&
6714 _dir_is_nonempty(mdr, oldin)) {
6715 respond_to_request(mdr, -ENOTEMPTY);
6716 return;
6717 }
6718
6719 /* project_past_snaprealm_parent() will do this job
6720 *
6721 // moving between snaprealms?
6722 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6723 SnapRealm *srcrealm = srci->find_snaprealm();
6724 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6725 if (srcrealm != destrealm &&
6726 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6727 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6728 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6729 mdcache->snaprealm_create(mdr, srci);
6730 return;
6731 }
6732 }
6733 */
6734
6735 assert(g_conf->mds_kill_rename_at != 1);
6736
6737 // -- open all srcdn inode frags, if any --
6738 // we need these open so that auth can properly delegate from inode to dirfrags
6739 // after the inode is _ours_.
6740 if (srcdnl->is_primary() &&
6741 !srcdn->is_auth() &&
6742 srci->is_dir()) {
6743 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6744 mdr->set_stickydirs(srci);
6745
6746 list<frag_t> frags;
6747 srci->dirfragtree.get_leaves(frags);
6748 for (list<frag_t>::iterator p = frags.begin();
6749 p != frags.end();
6750 ++p) {
6751 CDir *dir = srci->get_dirfrag(*p);
6752 if (!dir) {
6753 dout(10) << " opening " << *p << " under " << *srci << dendl;
6754 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6755 return;
6756 }
6757 }
6758 }
6759
6760 // -- prepare witnesses --
6761
6762 // do srcdn auth last
6763 mds_rank_t last = MDS_RANK_NONE;
6764 if (!srcdn->is_auth()) {
6765 last = srcdn->authority().first;
6766 mdr->more()->srcdn_auth_mds = last;
6767 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6768 // are involved in the rename operation.
6769 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6770 dout(10) << " preparing ambiguous auth for srci" << dendl;
6771 assert(mdr->more()->is_remote_frozen_authpin);
6772 assert(mdr->more()->rename_inode == srci);
6773 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6774 return;
6775 }
6776 }
6777
6778 for (set<mds_rank_t>::iterator p = witnesses.begin();
6779 p != witnesses.end();
6780 ++p) {
6781 if (*p == last) continue; // do it last!
6782 if (mdr->more()->witnessed.count(*p)) {
6783 dout(10) << " already witnessed by mds." << *p << dendl;
6784 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6785 dout(10) << " already waiting on witness mds." << *p << dendl;
6786 } else {
6787 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6788 return;
6789 }
6790 }
6791 if (!mdr->more()->waiting_on_slave.empty())
6792 return; // we're waiting for a witness.
6793
6794 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6795 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6796 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6797 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6798 return;
6799 }
6800
6801 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
6802 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6803 assert(g_conf->mds_kill_rename_at != 3);
6804 if (!mdr->more()->slaves.empty() && srci->is_dir())
6805 assert(g_conf->mds_kill_rename_at != 4);
6806
6807 // -- declare now --
6808 mdr->set_mds_stamp(ceph_clock_now());
6809
6810 // -- prepare journal entry --
6811 mdr->ls = mdlog->get_current_segment();
6812 EUpdate *le = new EUpdate(mdlog, "rename");
6813 mdlog->start_entry(le);
6814 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6815 if (!mdr->more()->witnessed.empty()) {
6816 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6817
6818 le->reqid = mdr->reqid;
6819 le->had_slaves = true;
6820
6821 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6822 // no need to send frozen auth pin to recovring auth MDS of srci
6823 mdr->more()->is_remote_frozen_authpin = false;
6824 }
6825
6826 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
6827 if (le->client_map.length())
6828 le->cmapv = mds->sessionmap.get_projected();
6829
6830 // -- commit locally --
6831 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
6832
6833 journal_and_reply(mdr, srci, destdn, le, fin);
6834}
6835
6836
// Completion path for a client rename after the EUpdate has been journaled:
// apply the projected state, notify replicas, reply to the client, and kick
// off any deferred cap/stray work.  Called from C_MDS_rename_finish.
void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  // if slaves witnessed this update, record that the master has logged it
  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  // apply the projected linkages/inodes built by _rename_prepare()
  _rename_apply(mdr, srcdn, destdn, straydn);

  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  // caps were imported with the inode; re-evaluate them after we reply
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test slave commit
  if (!mdr->more()->slaves.empty() && !in->is_dir())
    assert(g_conf->mds_kill_rename_at != 5);
  if (!mdr->more()->slaves.empty() && in->is_dir())
    assert(g_conf->mds_kill_rename_at != 6);

  // bump popularity
  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(mdr->get_mds_stamp(), in, META_POP_IWR);

  // did we import srci? if so, explicitly ack that import that, before we unlock and reply.

  assert(g_conf->mds_kill_rename_at != 7);

  // reply
  respond_to_request(mdr, 0);

  if (need_eval)
    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // clean up?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
  }
}
6880
6881
6882
6883// helpers
6884
6885bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
6886 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
6887{
6888 if (mds->is_cluster_degraded() &&
6889 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6890 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
6891 if (mdr->more()->waiting_on_slave.empty())
6892 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6893 return false;
6894 }
6895
6896 dout(10) << "_rename_prepare_witness mds." << who << dendl;
6897 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6898 MMDSSlaveRequest::OP_RENAMEPREP);
6899
6900 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
6901 for (auto dn : srctrace)
6902 req->srcdnpath.push_dentry(dn->name);
6903 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
6904 for (auto dn : dsttrace)
6905 req->destdnpath.push_dentry(dn->name);
6906 if (straydn)
6907 mdcache->replicate_stray(straydn, who, req->stray);
31f18b77
FG
6908
6909 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7c673cae
FG
6910
6911 // srcdn auth will verify our current witness list is sufficient
6912 req->witnesses = witnesse;
6913
6914 req->op_stamp = mdr->get_op_stamp();
6915 mds->send_message_mds(req, who);
6916
6917 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6918 mdr->more()->waiting_on_slave.insert(who);
6919 return true;
6920}
6921
// Decode the inode (and its client caps) that the srcdn auth bundled into
// this rename, importing it into our cache.  Returns the pre-import version
// of the inode so the caller can pre_dirty() against it.
// NOTE: the inode is temporarily forced to !auth and clean; the auth bit is
// fixed up later in _rename_apply() once the import is final.
version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
{
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  /* import node */
  bufferlist::iterator blp = mdr->more()->inode_import.begin();

  // imported caps: decode the client map from the import blob, then
  // re-encode it for the journal entry's client_map
  ::decode(mdr->more()->imported_client_map, blp);
  ::encode(mdr->more()->imported_client_map, *client_map_bl,
	   mds->mdsmap->get_up_features());
  prepare_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
					 mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();

  return oldpv;
}
6947
6948bool Server::_need_force_journal(CInode *diri, bool empty)
6949{
6950 list<CDir*> ls;
6951 diri->get_dirfrags(ls);
6952
6953 bool force_journal = false;
6954 if (empty) {
6955 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6956 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
6957 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
6958 force_journal = true;
6959 break;
6960 } else
6961 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
6962 }
6963 } else {
6964 // see if any children of our frags are auth subtrees.
6965 list<CDir*> subtrees;
6966 mdcache->list_subtrees(subtrees);
6967 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
6968 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6969 CDir *dir = *p;
6970 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
6971 if (dir->contains(*q)) {
6972 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
6973 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
6974 << **q << dendl;
6975 force_journal = true;
6976 break;
6977 } else
6978 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
6979 } else
6980 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
6981 }
6982 if (force_journal)
6983 break;
6984 }
6985 }
6986 return force_journal;
6987}
6988
// Build the journal metablob for a rename and push all projected state:
//  - project the overwritten (target) inode's nlink-- and stray linkage
//  - project the dest dentry's new linkage (remote or primary)
//  - project a null linkage for the src dentry
//  - predirty nest/mtime on all affected parents
//  - project snaprealm parent changes when crossing realms
// Nothing is applied here; _rename_apply() pops these projections after the
// journal entry commits.  Statement order is significant: it mirrors the
// order journal replay will process the events.
void Server::_rename_prepare(MDRequestRef& mdr,
			     EMetaBlob *metablob, bufferlist *client_map_bl,
			     CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  if (straydn)
    dout(10) << " straydn " << *straydn << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srci == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));
  // renames out of a stray dir (reintegration) must not touch nlink/mtime
  bool silent = srcdn->get_dir()->inode->is_stray();

  bool force_journal_dest = false;
  if (srci->is_dir() && !destdn->is_auth()) {
    if (srci->is_auth()) {
      // if we are auth for srci and exporting it, force journal because journal replay needs
      // the source inode to create auth subtrees.
      dout(10) << " we are exporting srci, will force journal destdn" << dendl;
      force_journal_dest = true;
    } else
      force_journal_dest = _need_force_journal(srci, false);
  }

  bool force_journal_stray = false;
  if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
    force_journal_stray = _need_force_journal(oldin, true);

  if (linkmerge)
    dout(10) << " merging remote and primary links to the same inode" << dendl;
  if (silent)
    dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
  if (force_journal_dest)
    dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
  if (force_journal_stray)
    dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;

  // record which dir inode moved, so replay can adjust the subtree map
  if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
    dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = srci->ino();
  } else if (oldin && oldin->is_dir() && force_journal_stray) {
    dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = oldin->ino();
  }

  // prepare
  inode_t *pi = 0;    // renamed inode
  inode_t *tpi = 0;  // target/overwritten inode

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      assert(straydn);  // moving to straydn.
      // link--, and move.
      if (destdn->is_auth()) {
	tpi = oldin->project_inode(); //project_snaprealm
	tpi->version = straydn->pre_dirty(tpi->version);
	tpi->update_backtrace();
      }
      straydn->push_projected_linkage(oldin);
    } else if (destdnl->is_remote()) {
      // nlink-- targeti
      if (oldin->is_auth()) {
	tpi = oldin->project_inode();
	tpi->version = oldin->pre_dirty();
      }
    }
  }

  // dest
  if (srcdnl->is_remote()) {
    if (!linkmerge) {
      // destdn
      if (destdn->is_auth())
	mdr->more()->pvmap[destdn] = destdn->pre_dirty();
      destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      // srci
      if (srci->is_auth()) {
	pi = srci->project_inode();
	pi->version = srci->pre_dirty();
      }
    } else {
      dout(10) << " will merge remote onto primary link" << dendl;
      if (destdn->is_auth()) {
	pi = oldin->project_inode();
	pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
      }
    }
  } else { // primary
    if (destdn->is_auth()) {
      version_t oldpv;
      if (srcdn->is_auth())
	oldpv = srci->get_projected_version();
      else {
	// srci is being imported from the srcdn auth along with the rename
	oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);

	// note which dirfrags have child subtrees in the journal
	// event, so that we can open those (as bounds) during replay.
	if (srci->is_dir()) {
	  list<CDir*> ls;
	  srci->get_dirfrags(ls);
	  for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	    CDir *dir = *p;
	    if (!dir->is_auth())
	      metablob->renamed_dir_frags.push_back(dir->get_frag());
	  }
	  dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
	}
      }
      pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
                                  // & srcdnl->snaprealm
      pi->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
      pi->update_backtrace();
    }
    destdn->push_projected_linkage(srci);
  }

  // src
  if (srcdn->is_auth())
    mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
  srcdn->push_projected_linkage();  // push null linkage

  // stamp ctime/change_attr unless this is a silent stray reintegration
  if (!silent) {
    if (pi) {
      pi->ctime = mdr->get_op_stamp();
      pi->change_attr++;
      if (linkmerge)
	pi->nlink--;
    }
    if (tpi) {
      tpi->ctime = mdr->get_op_stamp();
      tpi->change_attr++;
      destdn->make_path_string(tpi->stray_prior_path, true);
      tpi->nlink--;
      if (tpi->nlink == 0)
	oldin->state_set(CInode::STATE_ORPHAN);
    }
  }

  // prepare nesting, mtime updates
  int predirty_dir = silent ? 0:PREDIRTY_DIR;

  // guarantee stray dir is processed first during journal replay. unlink the old inode,
  // then link the source inode to destdn
  if (destdnl->is_primary()) {
    assert(straydn);
    if (straydn->is_auth()) {
      metablob->add_dir_context(straydn->get_dir());
      metablob->add_dir(straydn->get_dir(), true);
    }
  }

  // sub off target
  if (destdn->is_auth() && !destdnl->is_null()) {
    mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
				      (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
    if (destdnl->is_primary()) {
      assert(straydn);
      mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
					PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    }
  }

  // move srcdn
  int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
  int flags = predirty_dir | predirty_primary;
  if (srcdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
  if (destdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);

  SnapRealm *src_realm = srci->find_snaprealm();
  SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
  snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;

  // add it all to the metablob
  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      assert(straydn);
      if (destdn->is_auth()) {
	// project snaprealm, too
	if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
	  oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
	straydn->first = MAX(oldin->first, next_dest_snap);
	metablob->add_primary_dentry(straydn, oldin, true, true);
      } else if (force_journal_stray) {
	dout(10) << " forced journaling straydn " << *straydn << dendl;
	metablob->add_dir_context(straydn->get_dir());
	metablob->add_primary_dentry(straydn, oldin, true);
      }
    } else if (destdnl->is_remote()) {
      if (oldin->is_auth()) {
	// auth for targeti
	metablob->add_dir_context(oldin->get_projected_parent_dir());
	mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
				    CEPH_NOSNAP, 0, destdnl);
	metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
      }
    }
  }

  // dest
  if (srcdnl->is_remote()) {
    if (!linkmerge) {
      if (destdn->is_auth() && !destdnl->is_null())
	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
      else
	destdn->first = MAX(destdn->first, next_dest_snap);

      if (destdn->is_auth())
        metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
	metablob->add_dir_context(srci->get_projected_parent_dir());
        mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
	metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
      }
    } else {
      if (destdn->is_auth() && !destdnl->is_null())
	mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
      else
	destdn->first = MAX(destdn->first, next_dest_snap);

      if (destdn->is_auth())
        metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
    }
  } else if (srcdnl->is_primary()) {
    // project snap parent update?
    if (destdn->is_auth() && src_realm != dest_realm &&
        (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
      srci->project_past_snaprealm_parent(dest_realm);

    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
    else
      destdn->first = MAX(destdn->first, next_dest_snap);

    if (destdn->is_auth())
      metablob->add_primary_dentry(destdn, srci, true, true);
    else if (force_journal_dest) {
      dout(10) << " forced journaling destdn " << *destdn << dendl;
      metablob->add_dir_context(destdn->get_dir());
      metablob->add_primary_dentry(destdn, srci, true);
      if (srcdn->is_auth() && srci->is_dir()) {
	// journal new subtrees root dirfrags
	list<CDir*> ls;
	srci->get_dirfrags(ls);
	for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	  CDir *dir = *p;
	  if (dir->is_auth())
	    metablob->add_dir(dir, true);
	}
      }
    }
  }

  // src
  if (srcdn->is_auth()) {
    dout(10) << " journaling srcdn " << *srcdn << dendl;
    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
    // also journal the inode in case we need do slave rename rollback. It is Ok to add
    // both primary and NULL dentries. Because during journal replay, null dentry is
    // processed after primary dentry.
    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
      metablob->add_primary_dentry(srcdn, srci, true);
    metablob->add_null_dentry(srcdn, true);
  } else
    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;

  // make renamed inode first track the dn
  if (srcdnl->is_primary() && destdn->is_auth())
    srci->first = destdn->first;

  // project subtree-map moves for any directories that changed parents
  if (oldin && oldin->is_dir()) {
    assert(straydn);
    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
  }
  if (srci->is_dir())
    mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());

}
7275
7276
// Apply the rename that _rename_prepare() projected: unlink the target
// (moving it to the stray dir if it was primary), unlink the source, pop
// the projected linkages in journal-replay order, finalize any inode/cap
// import from the srcdn auth, and fix up the subtree map.
void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
		    (srcdnl->is_primary() || destdnl->is_primary()));

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      // overwritten primary target moves to the stray dir
      assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);

      straydn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
	assert(!straydn->is_projected()); // no other projected

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.

      // nlink-- targeti
      if (destdn->is_auth()) {
	bool hadrealm = (oldin->snaprealm ? true : false);
	oldin->pop_and_dirty_projected_inode(mdr->ls);
	// a snaprealm may have been projected onto oldin; notify clients
	if (oldin->snaprealm && !hadrealm)
	  mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
      } else {
	// FIXME this snaprealm is not filled out correctly
	//oldin->open_snaprealm();  might be sufficient..
      }
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn, false);
      if (oldin->is_auth())
	oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();
  assert(in);

  bool srcdn_was_remote = srcdnl->is_remote();
  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
	assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
	destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      if (in->is_auth())
	in->pop_and_dirty_projected_inode(mdr->ls);
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
      assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?  (we're taking over auth of the renamed inode)
    if (!srcdn->is_auth() && destdn->is_auth()) {
      assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_client_map, mdr->more()->sseq_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
	mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
						    mdr->more()->srcdn_auth_mds, true,
						    mdr->more()->cap_imports[destdnl->get_inode()],
						    imported_caps);
      }

      // replace the import blob with the resulting cap import map
      mdr->more()->inode_import.clear();
      ::encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */
      for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
	  i != mdr->xlocks.end();
	  ++i)
	if ((*i)->get_parent() == destdnl->get_inode() &&
	    !(*i)->is_locallock())
	  mds->locker->xlock_import(*i);

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth()) {
      in->pop_and_dirty_projected_inode(mdr->ls);

    } else {
      // FIXME: fix up snaprealm!
    }
  }

  // src
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
    assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}
7418
7419
7420
7421// ------------
7422// SLAVE
7423
7424class C_MDS_SlaveRenamePrep : public ServerLogContext {
7425 CDentry *srcdn, *destdn, *straydn;
7426public:
7427 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7428 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7429 void finish(int r) override {
7430 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7431 }
7432};
7433
7434class C_MDS_SlaveRenameCommit : public ServerContext {
7435 MDRequestRef mdr;
7436 CDentry *srcdn, *destdn, *straydn;
7437public:
7438 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7439 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7440 void finish(int r) override {
7441 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7442 }
7443};
7444
7445class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7446 MDRequestRef mdr;
7447public:
7448 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7449 ServerContext(s), mdr(r) {}
7450 void finish(int r) override {
7451 server->_slave_rename_sessions_flushed(mdr);
7452 }
7453};
7454
7455/* This function DOES put the mdr->slave_request before returning*/
7456void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7457{
7458 dout(10) << "handle_slave_rename_prep " << *mdr
7459 << " " << mdr->slave_request->srcdnpath
7460 << " to " << mdr->slave_request->destdnpath
7461 << dendl;
31f18b77
FG
7462
7463 if (mdr->slave_request->is_interrupted()) {
7464 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7465 MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7466 reply->mark_interrupted();
7467 mds->send_message_mds(reply, mdr->slave_to_mds);
7468 mdr->slave_request->put();
7469 mdr->slave_request = 0;
7470 return;
7471 }
7472
7c673cae
FG
7473 // discover destdn
7474 filepath destpath(mdr->slave_request->destdnpath);
7475 dout(10) << " dest " << destpath << dendl;
7476 vector<CDentry*> trace;
7477 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7478 if (r > 0) return;
7479 if (r == -ESTALE) {
7480 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7481 mdr->slave_to_mds);
7482 return;
7483 }
7484 assert(r == 0); // we shouldn't get an error here!
7485
7486 CDentry *destdn = trace[trace.size()-1];
7487 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7488 dout(10) << " destdn " << *destdn << dendl;
7489 mdr->pin(destdn);
7490
7491 // discover srcdn
7492 filepath srcpath(mdr->slave_request->srcdnpath);
7493 dout(10) << " src " << srcpath << dendl;
7494 CInode *srci = nullptr;
7495 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7496 if (r > 0) return;
7497 assert(r == 0);
7498
7499 // srcpath must not point to a null dentry
7500 assert(srci != nullptr);
7501
7502 CDentry *srcdn = trace[trace.size()-1];
7503 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7504 dout(10) << " srcdn " << *srcdn << dendl;
7505 mdr->pin(srcdn);
7506 mdr->pin(srci);
7507
7508 // stray?
7509 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7510 (srcdnl->is_primary() || destdnl->is_primary()));
7511 CDentry *straydn = mdr->straydn;
7512 if (destdnl->is_primary() && !linkmerge)
7513 assert(straydn);
7514
7515 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7516 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7517
7518 // set up commit waiter (early, to clean up any freezing etc we do)
7519 if (!mdr->more()->slave_commit)
7520 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7521
7522 // am i srcdn auth?
7523 if (srcdn->is_auth()) {
7524 set<mds_rank_t> srcdnrep;
7525 srcdn->list_replicas(srcdnrep);
7526
7527 bool reply_witness = false;
7528 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7529 // freeze?
7530 // we need this to
7531 // - avoid conflicting lock state changes
7532 // - avoid concurrent updates to the inode
7533 // (this could also be accomplished with the versionlock)
7534 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7535 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7536 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7537 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7538
7539 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7540 if (srcdnl->get_inode()->is_frozen_auth_pin())
7541 mdr->unfreeze_auth_pin();
7542
7543 if (!frozen_inode) {
7544 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7545 return;
7546 }
7547
7548 /*
7549 * set ambiguous auth for srci
7550 * NOTE: we don't worry about ambiguous cache expire as we do
7551 * with subtree migrations because all slaves will pin
7552 * srcdn->get_inode() for duration of this rename.
7553 */
7554 mdr->set_ambiguous_auth(srcdnl->get_inode());
7555
7556 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7557 // the master will send another OP_RENAMEPREP slave request later.
7558 if (mdr->slave_request->witnesses.size() > 1) {
7559 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7560 reply_witness = true;
7561 }
7562
7563 // make sure bystanders have received all lock related messages
7564 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7565 if (*p == mdr->slave_to_mds ||
7566 (mds->is_cluster_degraded() &&
7567 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7568 continue;
7569 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7570 MMDSSlaveRequest::OP_RENAMENOTIFY);
7571 mds->send_message_mds(notify, *p);
7572 mdr->more()->waiting_on_slave.insert(*p);
7573 }
7574
7575 // make sure clients have received all cap related messages
7576 set<client_t> export_client_set;
7577 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7578
7579 MDSGatherBuilder gather(g_ceph_context);
7580 flush_client_sessions(export_client_set, gather);
7581 if (gather.has_subs()) {
7582 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7583 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7584 gather.activate();
7585 }
7586 }
7587
7588 // is witness list sufficient?
7589 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7590 if (*p == mdr->slave_to_mds ||
7591 mdr->slave_request->witnesses.count(*p)) continue;
7592 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7593 reply_witness = true;
7594 break;
7595 }
7596
7597 if (reply_witness) {
7598 assert(!srcdnrep.empty());
7599 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7600 MMDSSlaveRequest::OP_RENAMEPREPACK);
7601 reply->witnesses.swap(srcdnrep);
7602 mds->send_message_mds(reply, mdr->slave_to_mds);
7603 mdr->slave_request->put();
7604 mdr->slave_request = 0;
7605 return;
7606 }
7607 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7608 if (!mdr->more()->waiting_on_slave.empty()) {
7609 dout(10) << " still waiting for rename notify acks from "
7610 << mdr->more()->waiting_on_slave << dendl;
7611 return;
7612 }
7613 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7614 // set ambiguous auth for srci on witnesses
7615 mdr->set_ambiguous_auth(srcdnl->get_inode());
7616 }
7617
7618 // encode everything we'd need to roll this back... basically, just the original state.
7619 rename_rollback rollback;
7620
7621 rollback.reqid = mdr->reqid;
7622
7623 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7624 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7625 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7626 rollback.orig_src.dname = srcdn->name;
7627 if (srcdnl->is_primary())
7628 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7629 else {
7630 assert(srcdnl->is_remote());
7631 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7632 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7633 }
7634
7635 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7636 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7637 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7638 rollback.orig_dest.dname = destdn->name;
7639 if (destdnl->is_primary())
7640 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7641 else if (destdnl->is_remote()) {
7642 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7643 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7644 }
7645
7646 if (straydn) {
7647 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7648 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7649 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7650 rollback.stray.dname = straydn->name;
7651 }
7652 ::encode(rollback, mdr->more()->rollback_bl);
7653 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7654
7655 // journal.
7656 mdr->ls = mdlog->get_current_segment();
7657 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7658 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7659 mdlog->start_entry(le);
7660 le->rollback = mdr->more()->rollback_bl;
7661
7662 bufferlist blah; // inode import data... obviously not used if we're the slave
7663 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7664
7665 if (le->commit.empty()) {
7666 dout(10) << " empty metablob, skipping journal" << dendl;
7667 mdlog->cancel_entry(le);
7668 mdr->ls = NULL;
7669 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7670 } else {
7671 mdr->more()->slave_update_journaled = true;
7672 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7673 mdr, __func__);
7674 mdlog->flush();
7675 }
7676}
7677
/*
 * Journal-completion callback for a slave rename prep (or called directly
 * when the slave update produced an empty metablob and was not journaled).
 * Applies the rename to the in-memory cache and acks the master with an
 * OP_RENAMEPREPACK.
 *
 * If this MDS is auth for a primary source inode, the inode (plus its
 * client cap map) is encoded into the reply so the master can import it;
 * in that case authority for the inode migrates to the master.
 */
void Server::_logged_slave_rename(MDRequestRef& mdr,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rename " << *mdr << dendl;

  // prepare ack (skipped if the master already aborted this request)
  MMDSSlaveRequest *reply = NULL;
  if (!mdr->aborted) {
    reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->slave_update_journaled)
      reply->mark_not_journaled();  // tell master there is nothing to roll back here
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = NULL;
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    // (a renamed directory's dirfrags stay behind as subtree bounds)
    list<CDir*> bounds;
    if (srcdnl->get_inode()->is_dir()) {
      srcdnl->get_inode()->get_dirfrags(bounds);
      for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
	(*p)->state_set(CDir::STATE_EXPORTBOUND);
    }

    map<client_t,entity_inst_t> exported_client_map;
    bufferlist inodebl;
    mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
					   exported_client_map);

    // the EXPORTBOUND flags were only needed for the encode above
    for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
      (*p)->state_clear(CDir::STATE_EXPORTBOUND);

    if (reply) {
      // client map first, then the inode state itself
      ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->inode.version;
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    // the importing (master) MDS is now responsible for persisting the inode
    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  destdnl = destdn->get_linkage();  // re-fetch: _rename_apply changed the linkage

  // bump popularity
  mds->balancer->hit_dir(mdr->get_mds_stamp(), srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(mdr->get_mds_stamp(), destdnl->get_inode(),
                             META_POP_IWR);

  // done.
  mdr->slave_request->put();
  mdr->slave_request = 0;
  mdr->straydn = 0;

  if (reply) {
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    // aborted: nothing to ack, just tear the request down locally
    assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
7753
/*
 * Commit (r == 0) or abort (r != 0) a slave rename on this MDS.
 *
 * On commit: finish any inode export we performed (hand off xlocks and
 * caps to the master, unfreeze), drop ambiguous-auth state, and journal
 * an ESlaveUpdate::OP_COMMIT if we had journaled the prepare.
 *
 * On abort: roll the rename back using the rollback blob saved at prep
 * time; if that blob is empty (master died before the second prep) we
 * only need to undo the freeze/ambiguous-auth state.
 */
void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  CDentry::linkage_t *destdnl = destdn->get_linkage();

  list<MDSInternalContextBase*> finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    // hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {

      CInode *in = destdnl->get_inode();

      // drop our pins
      // we exported, clear out any xlocks that we moved to another MDS
      set<SimpleLock*>::iterator i = mdr->xlocks.begin();
      while (i != mdr->xlocks.end()) {
	SimpleLock *lock = *i++;  // advance before xlock_export may mutate the set

	// we only care about xlocks on the exported inode
	if (lock->get_parent() == in &&
	    !lock->is_locallock())
	  mds->locker->xlock_export(lock, mdr.get());
      }

      // caps the master (importer) has issued on our behalf
      map<client_t,Capability::Import> peer_imported;
      bufferlist::iterator bp = mdr->more()->inode_import.begin();
      ::decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
      mdcache->migrator->finish_export_inode(destdnl->get_inode(),
					     mdr->get_mds_stamp(),
					     mdr->slave_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      assert(destdnl->get_inode()->is_frozen_inode());
      destdnl->get_inode()->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    // an overwritten inode now in the stray dir can drop its dirty bits
    // (only if our update was journaled; otherwise we never dirtied it)
    if (straydn && mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
	mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
					  mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
					  ESlaveUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      // nothing was journaled, so nothing needs a durable COMMIT record
      _committed_slave(mdr);
    }
  } else {

    // abort
    // rollback_bl may be empty if we froze the inode but had to provide an expanded
    // witness list from the master, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
	dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
	destdnl->get_inode()->abort_export();
      }
      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
	mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
	// rollback but preserve the slave request
	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
	mdr->more()->rollback_bl.clear();
      } else
	do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
	if (srcdn->is_auth())
	  mdr->more()->rename_inode->unfreeze_inode(finished);

	mdr->more()->rename_inode->clear_ambiguous_auth(finished);
	mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }
}
7854
7855void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
7856 bool isdir, int linkunlink, nest_info_t &rstat)
7857{
7858 fnode_t *pf;
7859 pf = dir->project_fnode();
7860 mut->add_projected_fnode(dir);
7861 pf->version = dir->pre_dirty();
7862
7863 if (isdir) {
7864 pf->fragstat.nsubdirs += linkunlink;
7865 } else {
7866 pf->fragstat.nfiles += linkunlink;
7867 }
7868 if (r.ino) {
7869 pf->rstat.rbytes += linkunlink * rstat.rbytes;
7870 pf->rstat.rfiles += linkunlink * rstat.rfiles;
7871 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
7872 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
7873 }
7874 if (pf->fragstat.mtime == ctime) {
7875 pf->fragstat.mtime = r.dirfrag_old_mtime;
7876 if (pf->rstat.rctime == ctime)
7877 pf->rstat.rctime = r.dirfrag_old_rctime;
7878 }
7879 mut->add_updated_lock(&dir->get_inode()->filelock);
7880 mut->add_updated_lock(&dir->get_inode()->nestlock);
7881}
7882
7883struct C_MDS_LoggedRenameRollback : public ServerLogContext {
7884 MutationRef mut;
7885 CDentry *srcdn;
7886 version_t srcdnpv;
7887 CDentry *destdn;
7888 CDentry *straydn;
7889 bool finish_mdr;
7890 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
7891 CDentry *sd, version_t pv, CDentry *dd,
7892 CDentry *st, bool f) :
7893 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
7894 straydn(st), finish_mdr(f) {}
7895 void finish(int r) override {
7896 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
7897 destdn, straydn, finish_mdr);
7898 }
7899};
7900
/*
 * Undo a prepared-but-uncommitted slave rename using the rollback blob
 * encoded at prep time.
 *
 * Re-locates the src/dest/stray dentries and inodes from the cache (any
 * of them may be missing if they were trimmed), re-projects the original
 * linkages, reverts ctimes and dir stats, journals an
 * ESlaveUpdate::OP_ROLLBACK if anything here needs to be durable, and
 * finally applies via _rename_rollback_finish().
 *
 * 'master' is the MDS whose request is being rolled back; 'mdr' may be
 * null during resolve.  'finish_mdr' says whether to finish the request
 * once the rollback has been applied.
 */
void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
				bool finish_mdr)
{
  rename_rollback rollback;
  bufferlist::iterator p = rbl.begin();
  ::decode(rollback, p);

  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, master);

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  // locate the source dentry, if it is still in cache
  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  if (!srcdir)
    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  if (srcdir) {
    dout(10) << " srcdir " << *srcdir << dendl;
    srcdn = srcdir->lookup(rollback.orig_src.dname);
    if (srcdn) {
      dout(10) << " srcdn " << *srcdn << dendl;
      // the rename moved it away, so the source slot must be empty now
      assert(srcdn->get_linkage()->is_null());
    } else
      dout(10) << " srcdn not found" << dendl;
  } else
    dout(10) << " srcdir not found" << dendl;

  // locate the destination dentry, if it is still in cache
  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  if (!destdir)
    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  if (destdir) {
    dout(10) << " destdir " << *destdir << dendl;
    destdn = destdir->lookup(rollback.orig_dest.dname);
    if (destdn)
      dout(10) << " destdn " << *destdn << dendl;
    else
      dout(10) << " destdn not found" << dendl;
  } else
    dout(10) << " destdir not found" << dendl;

  // the renamed inode (primary or remote link target)
  CInode *in = NULL;
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      assert(srcdn && destdn);  // dirs pin their dentries; both must be cached
  } else
    in = mdcache->get_inode(rollback.orig_src.remote_ino);

  // the stray dentry the overwritten target was moved to, if any
  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    if (straydir) {
      dout(10) << "straydir " << *straydir << dendl;
      straydn = straydir->lookup(rollback.stray.dname);
      if (straydn) {
	dout(10) << " straydn " << *straydn << dendl;
	assert(straydn->get_linkage()->is_primary());
      } else
	dout(10) << " straydn not found" << dendl;
    } else
      dout(10) << "straydir not found" << dendl;
  }

  // the inode that was overwritten by the rename, if any
  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    if (target)
      assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);

  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  // slave
  assert(!destdn || destdn->authority().first != whoami);
  assert(!straydn || straydn->authority().first != whoami);

  // do we need to journal non-auth dentries because a directory moved?
  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);
  
  version_t srcdnpv = 0;
  // repair src
  if (srcdn) {
    if (srcdn->authority().first == whoami)
      srcdnpv = srcdn->pre_dirty();
    // re-link the inode (or the remote link) back under the source name
    if (rollback.orig_src.ino) {
      assert(in);
      srcdn->push_projected_linkage(in);
    } else
      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
				    rollback.orig_src.remote_d_type);
  }

  // revert the renamed inode's ctime (project only if we are its auth)
  inode_t *pi = 0;
  if (in) {
    if (in->authority().first == whoami) {
      pi = in->project_inode();
      mut->add_projected_inode(in);
      pi->version = in->pre_dirty();
    } else
      pi = in->get_projected_inode();
    if (pi->ctime == rollback.ctime)
      pi->ctime = rollback.orig_src.old_ctime;
  }

  if (srcdn && srcdn->authority().first == whoami) {
    nest_info_t blah;
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
			 in ? in->is_dir() : false, 1, pi ? pi->accounted_rstat : blah);
  }

  // repair dest
  if (destdn) {
    if (rollback.orig_dest.ino && target) {
      destdn->push_projected_linkage(target);
    } else if (rollback.orig_dest.remote_ino) {
      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
				     rollback.orig_dest.remote_d_type);
    } else {
      // the dentry will be trimmed soon, it's ok to have wrong linkage
      if (rollback.orig_dest.ino)
	assert(mds->is_resolve());
      destdn->push_projected_linkage();
    }
  }

  // the stray slot goes back to null
  if (straydn)
    straydn->push_projected_linkage();

  // revert the overwritten target's ctime and restore its link count
  if (target) {
    inode_t *ti = NULL;
    if (target->authority().first == whoami) {
      ti = target->project_inode();
      mut->add_projected_inode(target);
      ti->version = target->pre_dirty();
    } else 
      ti = target->get_projected_inode();
    if (ti->ctime == rollback.ctime)
      ti->ctime = rollback.orig_dest.old_ctime;
    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
      // reintegration/migration of a stray: nlink was not decremented
      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
	assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
      else
	assert(rollback.orig_dest.remote_ino &&
	       rollback.orig_dest.remote_ino == rollback.orig_src.ino);
    } else
      ti->nlink++;
  }

  if (srcdn)
    dout(0) << " srcdn back to " << *srcdn << dendl;
  if (in)
    dout(0) << " srci back to " << *in << dendl;
  if (destdn)
    dout(0) << " destdn back to " << *destdn << dendl;
  if (target)
    dout(0) << " desti back to " << *target << dendl;
  
  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
				      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);

  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    else
      le->commit.add_remote_dentry(srcdn, true);
  }

  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  }

  if (force_journal_dest) {
    assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  }

  // slave: no need to journal straydn

  if (target && target != in && target->authority().first == whoami) {
    assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  }

  // record renamed-dir metadata so journal replay can fix the subtree map
  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      list<CDir*> ls;
      in->get_dirfrags(ls);
      for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
	CDir *dir = *p;
	if (!dir->is_auth())
	  le->commit.renamed_dir_frags.push_back(dir->get_frag());
      }
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
    }
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  }
  
  if (target && target->is_dir()) {
    assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  }

  if (in && in->is_dir()) {
    assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  }

  // if the prep was never journaled there is nothing durable to undo
  if (mdr && !mdr->more()->slave_update_journaled) {
    assert(le->commit.empty());
    mdlog->cancel_entry(le);
    mut->ls = NULL;
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
  } else {
    assert(!le->commit.empty());
    if (mdr)
      mdr->more()->slave_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
							    destdn, straydn, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
    mdlog->flush();
  }
}
8143
/*
 * Apply the projected rollback state built by do_rename_rollback():
 * pop the restored linkages, fix up subtree maps, undo any freeze /
 * ambiguous-auth state, and finish the request if asked to.
 */
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
				     version_t srcdnpv, CDentry *destdn,
				     CDentry *straydn, bool finish_mdr)
{
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;

  // stray and dest must unlink whatever the rename left in them before
  // their restored (projected) linkages are popped
  if (straydn) {
    straydn->get_dir()->unlink_inode(straydn);
    straydn->pop_projected_linkage();
  }
  if (destdn) {
    destdn->get_dir()->unlink_inode(destdn);
    destdn->pop_projected_linkage();
  }
  if (srcdn) {
    srcdn->pop_projected_linkage();
    if (srcdn->authority().first == mds->get_nodeid())
      srcdn->mark_dirty(srcdnpv, mut->ls);
  }

  mut->apply();

  // the renamed inode is back under srcdn; fix auth flag and subtree map
  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    if (srcdn->authority().first == mds->get_nodeid())
      in->state_set(CInode::STATE_AUTH);
    // update subtree map?
    if (in && in->is_dir()) {
      assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
    }
  }

  // the overwritten target moved back from the stray dir to destdn
  if (destdn) {
    CInode *oldin = destdn->get_linkage()->get_inode();
    // update subtree map?
    if (oldin && oldin->is_dir()) {
      assert(straydn);
      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
    }
  }

  // during resolve, non-auth subtrees touched by the rollback can be trimmed
  if (mds->is_resolve()) {
    CDir *root = NULL;
    if (straydn)
      root = mdcache->get_subtree_root(straydn->get_dir());
    else if (destdn)
      root = mdcache->get_subtree_root(destdn->get_dir());
    if (root)
      mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr) {
    list<MDSInternalContextBase*> finished;
    // undo the freeze/ambiguous-auth we set during prep
    if (mdr->more()->is_ambiguous_auth) {
      if (srcdn->is_auth())
	mdr->more()->rename_inode->unfreeze_inode(finished);

      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }
    mds->queue_waiters(finished);
    if (finish_mdr || mdr->aborted)
      mdcache->request_finish(mdr);
    else
      mdr->more()->slave_rolling_back = false;  // keep mdr alive for the master
  }

  mdcache->finish_rollback(mut->reqid);

  mut->cleanup();
}
8216
/* This function DOES put the passed message before returning*/
/*
 * Master-side handler for a slave's OP_RENAMEPREPACK.
 *
 * Either the slave witnessed the rename (possibly also exporting the
 * source inode to us), or it replied with an expanded witness list of
 * srcdn replicas we must also ask.  When the last outstanding slave has
 * answered, the client request is dispatched again.
 */
void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_rename_prep_ack " << *mdr
	   << " witnessed by " << ack->get_source()
	   << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // note slave
  mdr->more()->slaves.insert(from);
  // srcdn auth froze the inode for us; mirror its ambiguous-auth state here
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  }

  // witnessed?  or add extra witnesses?
  assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " slave request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_slaves = true;
  } else {
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses.swap(ack->witnesses);
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.claim(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else 
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}
8263
8264void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8265{
8266 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8267 << ack->get_source() << dendl;
8268 assert(mdr->is_slave());
8269 mds_rank_t from = mds_rank_t(ack->get_source().num());
8270
8271 if (mdr->more()->waiting_on_slave.count(from)) {
8272 mdr->more()->waiting_on_slave.erase(from);
8273
8274 if (mdr->more()->waiting_on_slave.empty()) {
8275 if (mdr->slave_request)
8276 dispatch_slave_request(mdr);
8277 } else
8278 dout(10) << " still waiting for rename notify acks from "
8279 << mdr->more()->waiting_on_slave << dendl;
8280 }
8281}
8282
8283void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8284{
8285 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8286
8287 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8288 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8289
8290 if (mdr->more()->waiting_on_slave.empty()) {
8291 if (mdr->slave_request)
8292 dispatch_slave_request(mdr);
8293 } else
8294 dout(10) << " still waiting for rename notify acks from "
8295 << mdr->more()->waiting_on_slave << dendl;
8296 }
8297}
8298
8299// snaps
/* This function takes responsibility for the passed mdr*/
/*
 * List a directory's snapshots (readdir of the virtual .snap dir).
 *
 * Encodes a readdir-style reply: dirstat, then up to max_entries
 * (name, lease, inodestat) triples within the client's max_bytes
 * budget, resuming after the snapshot named in path2 if set.
 */
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;

  // traverse to path
  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
     respond_to_request(mdr, -ESTALE);
     return;
  }
  if (!diri->is_auth()) {
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  dout(10) << "lssnap on " << *diri << dendl;

  // lock snap
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t,SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());

  unsigned max_entries = req->head.args.readdir.max_entries;
  if (!max_entries)
    max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;

  // resume after the snapshot the client last saw (path2), if any
  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());

  bufferlist dirbl;
  encode_empty_dirstat(dirbl);

  // reserve space for the trailing count/flags fields
  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;

  __u32 num = 0;
  bufferlist dnbl;
  map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;

    // actual
    // snaps made directly on diri use the short name; inherited ones
    // (from an ancestor realm) use the long "_name_ino" form
    string snap_name;
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
    else
      snap_name = p->second->get_long_name();

    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
      break;

    ::encode(snap_name, dnbl);
    encode_infinite_lease(dnbl);

    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
    if (r < 0) {
      // entry did not fit; truncate back to the last complete entry
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    ++num;
  }

  ::encode(num, dirbl);
  __u16 flags = 0;
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      flags |= CEPH_READDIR_FRAG_COMPLETE;  // full listing from the start
  }
  ::encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
8396
8397
8398// MKSNAP
8399
8400struct C_MDS_mksnap_finish : public ServerLogContext {
8401 CInode *diri;
8402 SnapInfo info;
8403 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8404 ServerLogContext(s, r), diri(di), info(i) {}
8405 void finish(int r) override {
8406 server->_mksnap_finish(mdr, diri, info);
8407 }
8408};
8409
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_mksnap(MDRequestRef& mdr)
{
  // Create a snapshot of a directory.  Runs in (up to) two passes: the
  // first pass sends a prepare to the snap table server and returns; the
  // request is retried via C_MDS_RetryRequest once the stid/snapid arrive.
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    respond_to_request(mdr, -EPERM);
    return;
  }

  MClientRequest *req = mdr->client_request;
  // The snapshot target is addressed by inode number from the request path.
  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }

  // dir only
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    respond_to_request(mdr, -EPERM);
    return;
  }

  // The snap name is the last path component of the request.
  const string &snapname = req->get_filepath().last_dentry();

  // Only uids inside the configured [mds_snap_min_uid, mds_snap_max_uid]
  // range may create snapshots.
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;

  // lock snap
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  // Read-lock the snap-realm chain, but take this dir's snaplock
  // exclusively since we are about to modify its realm.
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }
  // Empty names and the reserved leading '_' are rejected.
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // allocate a snapid
  if (!mdr->more()->stid) {
    // prepare an stid
    // First pass: ask the snap table for a transaction id and a fresh
    // snapid, then retry this request when the reply arrives.
    mds->snapclient->prepare_create(diri->ino(), snapname,
				    mdr->get_mds_stamp(),
				    &mdr->more()->stid, &mdr->more()->snapidbl,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  // Second pass: the prepare completed; decode the allocated snapid.
  version_t stid = mdr->more()->stid;
  snapid_t snapid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  ::decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  // journal
  SnapInfo info;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();

  // Project the inode update (ctime bump) alongside the realm change.
  inode_t *pi = diri->project_inode();
  pi->ctime = info.stamp;
  pi->version = diri->pre_dirty();

  // project the snaprealm
  sr_t *newsnap = diri->project_snaprealm(snapid);
  newsnap->snaps[snapid] = info;
  newsnap->seq = snapid;
  newsnap->last_created = snapid;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
                     mdr, __func__);
  mdlog->flush();
}
8525
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  // Journal commit landed: apply the projected snapshot state, commit the
  // snap table transaction, notify clients, and reply to the request.
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  // If the realm already existed this is a CREATE within it; otherwise the
  // projected realm is brand new and client caps must be SPLIT into it.
  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  // Commit the prepared snap table transaction (stid from the prepare).
  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  // create snap
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // Tell clients so they can update their cached snap realms.
  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
8548
8549
8550// RMSNAP
8551
8552struct C_MDS_rmsnap_finish : public ServerLogContext {
8553 CInode *diri;
8554 snapid_t snapid;
8555 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8556 ServerLogContext(s, r), diri(di), snapid(sn) {}
8557 void finish(int r) override {
8558 server->_rmsnap_finish(mdr, diri, snapid);
8559 }
8560};
8561
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  // Remove a snapshot from a directory.  Like mksnap, this runs in two
  // passes: the first sends prepare_destroy to the snap table and retries
  // the request once the stid/seq reply arrives.
  MClientRequest *req = mdr->client_request;

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }
  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // The snap name to remove is the last path component.
  const string &snapname = req->get_filepath().last_dentry();

  // Enforce the configured uid range for snapshot operations.
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  // Read-lock the realm chain; exclusively lock this dir's snaplock.
  set<SimpleLock*> rdlocks, wrlocks, xlocks;
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // prepare
  if (!mdr->more()->stid) {
    // First pass: prepare the destroy in the snap table, then retry.
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
				     &mdr->more()->stid, &mdr->more()->snapidbl,
				     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  // Second pass: decode the new realm sequence from the table reply.
  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // journal
  inode_t *pi = diri->project_inode();
  pi->version = diri->pre_dirty();
  pi->ctime = mdr->get_op_stamp();

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  sr_t *newnode = diri->project_snaprealm();
  newnode->snaps.erase(snapid);
  newnode->seq = seq;
  newnode->last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
8651
void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  // Journal commit landed: apply the projected realm/inode state, commit
  // the snap table destroy, notify clients, reply, and kick off purging.
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  // NOTE(review): stid is the snap table transaction version; it is typed
  // version_t in the handlers but snapid_t here -- both wrap a u64, but
  // this is worth unifying. Confirm before changing.
  snapid_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // Tell clients the snap is gone so they drop it from cached realms.
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  // Only safe once all past parent realms are open, so the full snap set
  // is known and stale snapped metadata can be dropped.
  if (diri->snaprealm->have_past_parents_open())
    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}
8677
8678struct C_MDS_renamesnap_finish : public ServerLogContext {
8679 CInode *diri;
8680 snapid_t snapid;
8681 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8682 ServerLogContext(s, r), diri(di), snapid(sn) {}
8683 void finish(int r) override {
8684 server->_renamesnap_finish(mdr, diri, snapid);
8685 }
8686};
8687
/* This function takes responsibility for the passed mdr*/
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  // Rename a snapshot of a directory.  filepath carries the destination
  // name, filepath2 the source name; both must refer to the same inode.
  // Two-pass like mksnap/rmsnap: prepare_update to the snap table first,
  // then retry once the stid/seq reply arrives.
  MClientRequest *req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
  if (!diri || diri->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!diri->is_auth()) {    // fw to auth?
    mdcache->request_forward(mdr, diri->authority().first);
    return;
  }

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  // Enforce the configured uid range for snapshot operations.
  if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  const string &dstname = req->get_filepath().last_dentry();
  const string &srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  // Validate names: no empty or reserved ('_'-prefixed) names, source
  // must exist, destination must not.
  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  // Read-lock the realm chain; exclusively lock this dir's snaplock.
  mds->locker->include_snap_rdlocks(rdlocks, diri);
  rdlocks.erase(&diri->snaplock);
  xlocks.insert(&diri->snaplock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  if (!check_access(mdr, diri, MAY_WRITE))
    return;

  // prepare
  if (!mdr->more()->stid) {
    // First pass: prepare the rename in the snap table, then retry.
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
				    &mdr->more()->stid, &mdr->more()->snapidbl,
				    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  // Second pass: decode the new realm sequence from the table reply.
  version_t stid = mdr->more()->stid;
  bufferlist::iterator p = mdr->more()->snapidbl.begin();
  snapid_t seq;
  ::decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  // journal
  inode_t *pi = diri->project_inode();
  pi->ctime = mdr->get_op_stamp();
  pi->version = diri->pre_dirty();

  // project the snaprealm
  sr_t *newsnap = diri->project_snaprealm();
  assert(newsnap->snaps.count(snapid));
  newsnap->snaps[snapid].name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
8795
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  // Journal commit landed: apply the projected rename, commit the snap
  // table update, notify clients, and reply.
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // UPDATE op with notify so clients refresh the renamed snap's metadata.
  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}
8815
8816/**
8817 * Return true if server is in state RECONNECT and this
8818 * client has not yet reconnected.
8819 */
8820bool Server::waiting_for_reconnect(client_t c) const
8821{
8822 return client_reconnect_gather.count(c) > 0;
8823}
8824
// Dump the set of clients we are still waiting on during reconnect as a
// "reconnect_status" object on the given formatter (admin-socket output).
void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}