// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "messages/MClientSession.h"
#include "messages/MClientRequest.h"
#include "messages/MClientReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientCaps.h"
#include "messages/MClientSnap.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MLock.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <iostream>
#include <boost/utility/string_view.hpp>
using namespace std;

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSInternalContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false)
{
}

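// Top-level message entry point for the Server. Reconnect messages are
// handled in any state; client requests that arrive before the MDS is
// active are either queued for replay (reconnect/clientreplay) or deferred
// until the rank becomes active.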
/* This function DOES put the passed message before returning */
void Server::dispatch(Message *m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(static_cast<MClientReconnect*>(m));
    return;
  }

  // active?
  // handle_slave_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    MClientRequest *req = static_cast<MClientRequest*>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        req->put();
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
      } else if (req->get_retry_attempt()) {
        // process completed request in clientreplay stage. The completed request
        // might have created a new file/directory. This guarantees the MDS sends
        // a reply to the client before another request modifies the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(static_cast<MClientRequest*>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(static_cast<MMDSSlaveRequest*>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    assert(0 == "server unknown message");
  }
}



// ----------------------------------------------------------
// SESSION management

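// Journal-commit callback for session state changes: once the ESession
// event is safe, apply the new state (and any inotable release) via
// Server::_session_logged(), then complete the optional waiter.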
class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

/* This function DOES put the passed message before returning */
void Server::handle_client_session(MClientSession *m)
{
  version_t pv;
  bool blacklisted = false;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    m->put();
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      // set client metadata for session opened by prepare_force_open_sessions
      if (!m->client_meta.empty())
        session->set_client_metadata(m->client_meta);
      m->put();
      return;
    }
    assert(session->is_closed() ||
           session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      m->put();
      return;
    }

    blacklisted = mds->objecter->with_osdmap(
        [session](const OSDMap &osd_map) -> bool {
          return osd_map.is_blacklisted(session->info.inst.addr);
        });

    if (blacklisted) {
      dout(10) << "rejecting blacklisted client " << session->info.inst.addr << dendl;
      mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
      m->put();
      return;
    }

    session->set_client_metadata(m->client_meta);
    dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN "
             << session->info.client_metadata.size() << " metadata entries:" << dendl;
    for (map<string, string>::iterator i = session->info.client_metadata.begin();
         i != session->info.client_metadata.end(); ++i) {
      dout(20) << " " << i->first << ": " << i->second << dendl;
    }

    // Special case for the 'root' metadata path; validate that the claimed
    // root is actually within the caps of the session
    if (session->info.client_metadata.count("root")) {
      const auto claimed_root = session->info.client_metadata.at("root");
      // claimed_root has a leading "/" which we strip before passing
      // into caps check
      if (claimed_root.empty() || claimed_root[0] != '/' ||
          !session->auth_caps.path_capable(claimed_root.substr(1))) {
        derr << __func__ << " forbidden path claimed as mount root: "
             << claimed_root << " by " << m->get_source() << dendl;
        // Tell the client we're rejecting their open
        mds->send_message_client(new MClientSession(CEPH_SESSION_REJECT), session);
        mds->clog->warn() << "client session with invalid root '" <<
          claimed_root << "' denied (" << session->info.inst << ")";
        session->clear();
        // Drop out; don't record this session in SessionMap or journal it.
        break;
      }
    }

    if (session->is_closed())
      mds->sessionmap.add_session(session);

    pv = mds->sessionmap.mark_projected(session);
    sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    mds->sessionmap.touch_session(session);
    mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, m->client_meta),
                              new C_MDS_session_finish(this, session, sseq, true, pv));
    mdlog->flush();
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() ||
        session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      m->get_connection()->send_message(new MClientSession(CEPH_SESSION_RENEWCAPS, m->get_seq()));
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        m->put();
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        m->put();
        return;
      }
      assert(session->is_open() ||
             session->is_stale() ||
             session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        m->put();
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        m->put();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
  m->put();
}

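// Ask each open session (that supports CEPH_FEATURE_EXPORT_PEER) to ack a
// FLUSHMSG, registering a gather sub that is completed when the
// corresponding FLUSHMSG_ACK arrives.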
void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    assert(session);
    if (!session->is_open() ||
        !session->connection.get() ||
        !session->connection->has_feature(CEPH_FEATURE_EXPORT_PEER))
      continue;
    version_t seq = session->wait_for_flush(gather.new_sub());
    mds->send_message_client(new MClientSession(CEPH_SESSION_FLUSHMSG, seq), session);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  list<MDSInternalContextBase*> finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

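// Called once an ESession event is safely journaled: make the projected
// session state real. Opens notify the client; closes and kills release
// preallocated inos and tear down remaining caps, leases and the connection.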
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    assert(session->is_closing() || session->is_killing() ||
           session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    assert(session->connection != NULL);
    session->connection->send_message(new MClientSession(CEPH_SESSION_OPEN));
    if (mdcache->is_readonly())
      session->connection->send_message(new MClientSession(CEPH_SESSION_FORCE_RO));
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, session->info.inst.name.num());
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.count(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      client_reconnect_gather.erase(session->info.get_client());

      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->connection != NULL) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->connection->mark_disposable();
      }

      // reset session
      mds->send_message_client(new MClientSession(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->connection != NULL) {
        session->connection->mark_down();
        session->connection->set_priv(NULL);
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blacklisted(p->second.addr)) {
            dout(10) << " ignoring blacklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
    } else {
      assert(session->is_open() ||
             session->is_opening() ||
             session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        mds->send_message_client(new MClientSession(CEPH_SESSION_OPEN), session);
        if (mdcache->is_readonly())
          mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(2) << "terminate_sessions" << dendl;

  terminating_sessions = true;

  // kill them off. clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}

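// Periodic scan in two phases: first mark open sessions that have not
// renewed caps within the session timeout as STALE (revoking caps and
// leases), then evict stale sessions that exceed the autoclose threshold,
// unless this MDS itself was recently laggy.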
void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  // (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
  while (1) {
    Session *session = mds->sessionmap.get_oldest_session(Session::STATE_OPEN);
    if (!session) break;
    auto last_cap_renew_span = std::chrono::duration<double>(now-session->last_cap_renew).count();
    if (last_cap_renew_span < cutoff) {
      dout(20) << "laggiest active session is " << session->info.inst << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
      break;
    }

    dout(10) << "new stale session " << session->info.inst << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
    mds->sessionmap.set_state(session, Session::STATE_STALE);
    mds->locker->revoke_stale_caps(session);
    mds->locker->remove_stale_leases(session);
    mds->send_message_client(new MClientSession(CEPH_SESSION_STALE, session->get_push_seq()), session);
    finish_flush_session(session, session->get_push_seq());
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 && mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "skipping client eviction because there is only one" << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  std::vector<Session *> to_evict;
  const auto sessions_p = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p == mds->sessionmap.by_state.end() || sessions_p->second->empty()) {
    return;
  }
  const auto &stale_sessions = sessions_p->second;
  assert(stale_sessions != nullptr);

  for (const auto &session: *stale_sessions) {
    auto last_cap_renew_span = std::chrono::duration<double>(now-session->last_cap_renew).count();
    if (session->is_importing()) {
      dout(10) << "stopping at importing session " << session->info.inst << dendl;
      break;
    }
    assert(session->is_stale());
    if (last_cap_renew_span < cutoff) {
      dout(20) << "oldest stale session is " << session->info.inst << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
      break;
    }

    to_evict.push_back(session);
  }

  for (const auto &session: to_evict) {
    auto last_cap_renew_span = std::chrono::duration<double>(now-session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->info.inst.name.num(), false, true,
                        ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

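// Evict clients that have not acknowledged a cap revoke within
// mds_cap_revoke_eviction_timeout (a timeout of 0 disables this behaviour).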
void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  std::list<client_t> to_evict;
  mds->locker->get_late_revoking_clients(&to_evict, cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    std::stringstream ss;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf->mds_session_blacklist_on_evict,
                                     ss, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const struct md_config_t *conf,
                                const std::set <std::string> &changed) {
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = conf->get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
}

/*
 * XXX bump in the interface here, not using an MDSInternalContextBase here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    assert(session->is_closing() ||
           session->is_closed() ||
           session->is_killing() ||
           session->is_importing());
    if (on_safe) {
      on_safe->complete(0);
    }
  }
}

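// Kill every client session whose address appears in the OSDMap blacklist;
// returns the number of sessions killed. MDS peers are skipped because
// their death is learned via the MDSMap.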
size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  std::list<Session*> victims;
  const auto sessions = mds->sessionmap.get_sessions();
  for (const auto p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    if (blacklist.count(s->info.inst.addr)) {
      victims.push_back(s);
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

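// Journal a session close (or kill): project the state change plus the
// release of all preallocated and pending-prealloc inos, then kill any
// requests still attached to the session while the ESession event commits.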
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

void Server::reconnect_clients(MDSInternalContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open())
      client_reconnect_gather.insert(session->get_client());
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = ceph_clock_now();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

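// A reconnect message carries the client's view of its snaprealms and
// issued caps. Caps on auth inodes are rebuilt immediately; caps on
// non-auth or not-yet-loaded inodes are stashed for the rejoin phase.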
/* This function DOES put the passed message before returning */
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  assert(session);

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  utime_t delay = ceph_clock_now();
  delay -= reconnect_start;
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf->mds_reconnect_timeout << ")";
    deny = true;
  } else if (!session->is_open()) {
    dout(1) << " session is closed, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst() << " (session is closed)";
    deny = true;
  } else if (mdcache->is_readonly()) {
    dout(1) << " read-only FS, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is read-only)";
    deny = true;
  }

  if (deny) {
    m->get_connection()->send_message(new MClientSession(CEPH_SESSION_CLOSE));
    m->put();
    return;
  }

  // notify client of success with an OPEN
  m->get_connection()->send_message(new MClientSession(CEPH_SESSION_OPEN));
  session->last_cap_renew = clock::now();
  mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;

  // snaprealms
  for (vector<ceph_mds_snaprealm_reconnect>::iterator p = m->realms.begin();
       p != m->realms.end();
       ++p) {
    CInode *in = mdcache->get_inode(inodeno_t(p->ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      assert(in->snaprealm);
      if (in->snaprealm->have_past_parents_open()) {
        dout(15) << "open snaprealm (w/ past parents) on " << *in << dendl;
        mdcache->finish_snaprealm_reconnect(from, in->snaprealm, snapid_t(p->seq));
      } else {
        dout(15) << "open snaprealm (w/o past parents) on " << *in << dendl;
        mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
      }
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(p->ino)
               << " seq " << p->seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(p->ino), snapid_t(p->seq));
    }
  }

  // caps
  for (map<inodeno_t, cap_reconnect_t>::iterator p = m->caps.begin();
       p != m->caps.end();
       ++p) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p->second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p->second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p->first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p->second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p->second, session);
      mdcache->add_reconnected_cap(from, p->first, p->second);
      recover_filelocks(in, p->second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_export_caps(p->first, from, p->second,
                                  in->authority().first);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p->first << ", will load later" << dendl;
      p->second.path.clear(); // we don't need path
      mdcache->rejoin_recovered_caps(p->first, from, p->second, MDS_RANK_NONE);
    }
  }
  mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty())
    reconnect_gather_finish();

  m->put();
}



void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  assert(reconnect_done);
  reconnect_done->complete(0);
  reconnect_done = NULL;
}

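// Called periodically while in reconnect: once mds_reconnect_timeout has
// elapsed, give up on the remaining clients, evicting (and optionally
// blacklisting) them before finishing the reconnect gather.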
void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(4) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  utime_t reconnect_end = reconnect_start;
  reconnect_end += g_conf->mds_reconnect_timeout;
  if (ceph_clock_now() >= reconnect_end &&
      !client_reconnect_gather.empty()) {
    dout(10) << "reconnect timed out" << dendl;

    // If we're doing blacklist evictions, use this to wait for them before
    // proceeding to reconnect_gather_finish
    MDSGatherBuilder gather(g_ceph_context);

    for (set<client_t>::iterator p = client_reconnect_gather.begin();
         p != client_reconnect_gather.end();
         ++p) {
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
      assert(session);
      dout(1) << "reconnect gave up on " << session->info.inst << dendl;

      mds->clog->warn() << "evicting unresponsive client " << *session
                        << ", after waiting " << g_conf->mds_reconnect_timeout
                        << " seconds during MDS startup";

      if (g_conf->mds_session_blacklist_on_timeout) {
        std::stringstream ss;
        mds->evict_client(session->info.inst.name.num(), false, true, ss,
                          gather.new_sub());
      } else {
        kill_session(session, NULL);
      }

      failed_reconnects++;
    }
    client_reconnect_gather.clear();

    if (gather.has_subs()) {
      dout(1) << "reconnect will complete once clients are evicted" << dendl;
      gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
            [this](int r){reconnect_gather_finish();})));
      gather.activate();
      reconnect_evicting = true;
    } else {
      reconnect_gather_finish();
    }
  }
}

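// Reinstate a client's file locks from the reconnect payload. The
// bufferlist holds two sections, fcntl locks then flock locks, each a
// count followed by that many ceph_filelock records; every lock is
// re-tagged with the reconnecting client's id.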
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  bufferlist::iterator p = locks.begin();
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  ::decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    ::decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}


/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
void Server::recall_client_state(void)
{
  /* try to recall at least 80% of all caps */
  uint64_t max_caps_per_client = Capability::count() * g_conf->get_val<double>("mds_max_ratio_caps_per_client");
  uint64_t min_caps_per_client = g_conf->get_val<uint64_t>("mds_min_caps_per_client");
  if (max_caps_per_client < min_caps_per_client) {
    dout(0) << "max_caps_per_client " << max_caps_per_client
            << " < min_caps_per_client " << min_caps_per_client << dendl;
    max_caps_per_client = min_caps_per_client + 1;
  }

  /* unless this ratio is smaller: */
  /* ratio: determine the amount of caps to recall from each client. Use
   * percentage full over the cache reservation. Cap the ratio at 80% of client
   * caps. */
  double ratio = 1.0-fmin(0.80, mdcache->cache_toofull_ratio());

  dout(10) << "recall_client_state " << ratio
           << ", caps per client " << min_caps_per_client << "-" << max_caps_per_client
           << dendl;

  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto &session : sessions) {
    if (!session->is_open() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << " session " << session->info.inst
             << " caps " << session->caps.size()
             << ", leases " << session->leases.size()
             << dendl;

    uint64_t newlim = MAX(MIN((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
    if (session->caps.size() > newlim) {
      MClientSession *m = new MClientSession(CEPH_SESSION_RECALL_STATE);
      m->head.max_caps = newlim;
      mds->send_message_client(m, session);
      session->notify_recall_sent(newlim);
    }
  }
}

void Server::force_clients_readonly()
{
  dout(10) << "force_clients_readonly" << dendl;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (!session->info.inst.name.is_client() ||
        !(session->is_open() || session->is_stale()))
      continue;
    mds->send_message_client(new MClientSession(CEPH_SESSION_FORCE_RO), session);
  }
}

/*******
 * some generic stuff for finishing off requests
 */
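// Journal an update and reply to the client: pin the trace inode/dentry on
// the request, send an unsafe early reply when allowed, then submit the
// log event. Replayed ops flush the log eagerly so replay makes progress.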
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op, flushing" << dendl;
      mdlog->flush();
    }
  } else if (mdr->did_early_reply)
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
  else
    mdlog->flush();
}

void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
                                const char *event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event_string(event_str);
  }
  mdlog->submit_entry(le, fin);
}

/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    reply_client_request(mdr, new MClientReply(mdr->client_request, r));
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      assert(0 == "trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}

// record per-op request latency statistics
void Server::perf_gather_op_latency(const MClientRequest* req, utime_t lat)
{
  int code = l_mdss_first;
  switch(req->get_op()) {
  case CEPH_MDS_OP_LOOKUPHASH:
    code = l_mdss_req_lookuphash_latency;
    break;
  case CEPH_MDS_OP_LOOKUPINO:
    code = l_mdss_req_lookupino_latency;
    break;
  case CEPH_MDS_OP_LOOKUPPARENT:
    code = l_mdss_req_lookupparent_latency;
    break;
  case CEPH_MDS_OP_LOOKUPNAME:
    code = l_mdss_req_lookupname_latency;
    break;
  case CEPH_MDS_OP_LOOKUP:
    code = l_mdss_req_lookup_latency;
    break;
  case CEPH_MDS_OP_LOOKUPSNAP:
    code = l_mdss_req_lookupsnap_latency;
    break;
  case CEPH_MDS_OP_GETATTR:
    code = l_mdss_req_getattr_latency;
    break;
  case CEPH_MDS_OP_SETATTR:
    code = l_mdss_req_setattr_latency;
    break;
  case CEPH_MDS_OP_SETLAYOUT:
    code = l_mdss_req_setlayout_latency;
    break;
  case CEPH_MDS_OP_SETDIRLAYOUT:
    code = l_mdss_req_setdirlayout_latency;
    break;
  case CEPH_MDS_OP_SETXATTR:
    code = l_mdss_req_setxattr_latency;
    break;
  case CEPH_MDS_OP_RMXATTR:
    code = l_mdss_req_rmxattr_latency;
    break;
  case CEPH_MDS_OP_READDIR:
    code = l_mdss_req_readdir_latency;
    break;
  case CEPH_MDS_OP_SETFILELOCK:
    code = l_mdss_req_setfilelock_latency;
    break;
  case CEPH_MDS_OP_GETFILELOCK:
    code = l_mdss_req_getfilelock_latency;
    break;
  case CEPH_MDS_OP_CREATE:
    code = l_mdss_req_create_latency;
    break;
  case CEPH_MDS_OP_OPEN:
    code = l_mdss_req_open_latency;
    break;
  case CEPH_MDS_OP_MKNOD:
    code = l_mdss_req_mknod_latency;
    break;
  case CEPH_MDS_OP_LINK:
    code = l_mdss_req_link_latency;
    break;
  case CEPH_MDS_OP_UNLINK:
    code = l_mdss_req_unlink_latency;
    break;
  case CEPH_MDS_OP_RMDIR:
    code = l_mdss_req_rmdir_latency;
    break;
  case CEPH_MDS_OP_RENAME:
    code = l_mdss_req_rename_latency;
    break;
  case CEPH_MDS_OP_MKDIR:
    code = l_mdss_req_mkdir_latency;
    break;
  case CEPH_MDS_OP_SYMLINK:
    code = l_mdss_req_symlink_latency;
    break;
  case CEPH_MDS_OP_LSSNAP:
    code = l_mdss_req_lssnap_latency;
    break;
  case CEPH_MDS_OP_MKSNAP:
    code = l_mdss_req_mksnap_latency;
    break;
  case CEPH_MDS_OP_RMSNAP:
    code = l_mdss_req_rmsnap_latency;
    break;
  case CEPH_MDS_OP_RENAMESNAP:
    code = l_mdss_req_renamesnap_latency;
    break;
  default: ceph_abort();
  }
  logger->tinc(code, lat);
}

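// Send an unsafe reply before the journal commit so the client can proceed
// early. Skipped when disabled by config, when slaves have journaled state,
// when an ino was allocated, or for replayed requests.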
1365 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1366 {
1367 if (!g_conf->mds_early_reply)
1368 return;
1369
1370 if (mdr->no_early_reply) {
1371 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1372 return;
1373 }
1374
1375 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
1376 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
1377 return;
1378 }
1379
1380 if (mdr->alloc_ino) {
1381 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
1382 return;
1383 }
1384
1385 MClientRequest *req = mdr->client_request;
1386 entity_inst_t client_inst = req->get_source_inst();
1387 if (client_inst.name.is_mds())
1388 return;
1389
1390 if (req->is_replay()) {
1391 dout(10) << " no early reply on replay op" << dendl;
1392 return;
1393 }
1394
1395
1396 MClientReply *reply = new MClientReply(req, 0);
1397 reply->set_unsafe();
1398
1399 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1400 //
1401 //_rename_finish() does not send dentry link/unlink message to replicas.
1402 // so do not set xlocks on dentries "done", the xlocks prevent dentries
1403 // that have projected linkages from getting new replica.
1404 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
1405
1406 dout(10) << "early_reply " << reply->get_result()
1407 << " (" << cpp_strerror(reply->get_result())
1408 << ") " << *req << dendl;
1409
1410 if (tracei || tracedn) {
1411 if (tracei)
1412 mdr->cap_releases.erase(tracei->vino());
1413 if (tracedn)
1414 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1415
1416 set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
1417 req->get_dentry_wanted(), mdr);
1418 }
1419
1420 reply->set_extra_bl(mdr->reply_extra_bl);
1421 req->get_connection()->send_message(reply);
1422
1423 mdr->did_early_reply = true;
1424
1425 mds->logger->inc(l_mds_reply);
1426 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
1427 mds->logger->tinc(l_mds_reply_latency, lat);
1428 if (client_inst.name.is_client()) {
1429 mds->sessionmap.hit_session(mdr->session);
1430 }
1431 perf_gather_op_latency(req, lat);
1432 dout(20) << "lat " << lat << dendl;
1433
1434 mdr->mark_event("early_replied");
1435 }
1436
1437 /*
1438 * send given reply
1439 * include a trace to tracei
1440 * Clean up mdr
1441 */
1442 void Server::reply_client_request(MDRequestRef& mdr, MClientReply *reply)
1443 {
1444 assert(mdr.get());
1445 MClientRequest *req = mdr->client_request;
1446
1447 dout(7) << "reply_client_request " << reply->get_result()
1448 << " (" << cpp_strerror(reply->get_result())
1449 << ") " << *req << dendl;
1450
1451 mdr->mark_event("replying");
1452
1453 Session *session = mdr->session;
1454
1455 // note successful request in session map?
1456 //
1457 // setfilelock requests are special, they only modify states in MDS memory.
1458 // The states get lost when MDS fails. If Client re-send a completed
1459 // setfilelock request, it means that client did not receive corresponding
1460 // setfilelock reply. So MDS should re-execute the setfilelock request.
1461 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1462 reply->get_result() == 0 && session) {
1463 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1464 session->add_completed_request(mdr->reqid.tid, created);
1465 if (mdr->ls) {
1466 mdr->ls->touched_sessions.insert(session->info.inst.name);
1467 }
1468 }
1469
1470 // give any preallocated inos to the session
1471 apply_allocated_inos(mdr, session);
1472
1473 // get tracei/tracedn from mdr?
1474 snapid_t snapid = mdr->snapid;
1475 CInode *tracei = mdr->tracei;
1476 CDentry *tracedn = mdr->tracedn;
1477
1478 bool is_replay = mdr->client_request->is_replay();
1479 bool did_early_reply = mdr->did_early_reply;
1480 entity_inst_t client_inst = req->get_source_inst();
1481 int dentry_wanted = req->get_dentry_wanted();
1482
1483 if (!did_early_reply && !is_replay) {
1484
1485 mds->logger->inc(l_mds_reply);
1486 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1487 mds->logger->tinc(l_mds_reply_latency, lat);
1488 if (client_inst.name.is_client()) {
1489 mds->sessionmap.hit_session(session);
1490 }
1491 perf_gather_op_latency(req, lat);
1492 dout(20) << "lat " << lat << dendl;
1493
1494 if (tracei)
1495 mdr->cap_releases.erase(tracei->vino());
1496 if (tracedn)
1497 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1498 }
1499
1500 // drop non-rdlocks before replying, so that we can issue leases
1501 mdcache->request_drop_non_rdlocks(mdr);
1502
1503 // reply at all?
1504 if (client_inst.name.is_mds() || !session) {
1505 reply->put(); // mds doesn't need a reply
1506 reply = 0;
1507 } else {
1508 // send reply.
1509 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1510 (tracei || tracedn)) {
1511 if (is_replay) {
1512 if (tracei)
1513 mdcache->try_reconnect_cap(tracei, session);
1514 } else {
1515 // include metadata in reply
1516 set_trace_dist(session, reply, tracei, tracedn,
1517 snapid, dentry_wanted,
1518 mdr);
1519 }
1520 }
1521
1522 // We can set the extra bl unconditionally: if it's already been sent in the
1523 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1524 reply->set_extra_bl(mdr->reply_extra_bl);
1525
1526 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
1527 req->get_connection()->send_message(reply);
1528 }
1529
1530 if (req->is_queued_for_replay() &&
1531 (mdr->has_completed || reply->get_result() < 0)) {
1532 if (reply->get_result() < 0) {
1533 int r = reply->get_result();
1534 derr << "reply_client_request: failed to replay " << *req
1535 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
1536 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
1537 }
1538 mds->queue_one_replay();
1539 }
1540
1541 // clean up request
1542 mdcache->request_finish(mdr);
1543
1544 // take a closer look at tracei, if it happens to be a remote link
1545 if (tracei &&
1546 tracedn &&
1547 tracedn->get_projected_linkage()->is_remote()) {
1548 mdcache->eval_remote(tracedn);
1549 }
1550 }
1551
1552
1553 void Server::encode_empty_dirstat(bufferlist& bl)
1554 {
1555 static DirStat empty;
1556 empty.encode(bl);
1557 }
1558
1559 void Server::encode_infinite_lease(bufferlist& bl)
1560 {
1561 LeaseStat e;
1562 e.seq = 0;
1563 e.mask = -1;
1564 e.duration_ms = -1;
1565 ::encode(e, bl);
1566 dout(20) << "encode_infinite_lease " << e << dendl;
1567 }
1568
1569 void Server::encode_null_lease(bufferlist& bl)
1570 {
1571 LeaseStat e;
1572 e.seq = 0;
1573 e.mask = 0;
1574 e.duration_ms = 0;
1575 ::encode(e, bl);
1576 dout(20) << "encode_null_lease " << e << dendl;
1577 }
1578
1579
1580 /*
1581 * pass inode OR dentry (not both, or we may get confused)
1582 *
1583 * trace is in reverse order (i.e. root inode comes last)
1584 */
1585 void Server::set_trace_dist(Session *session, MClientReply *reply,
1586 CInode *in, CDentry *dn,
1587 snapid_t snapid,
1588 int dentry_wanted,
1589 MDRequestRef& mdr)
1590 {
1591 // skip doing this for debugging purposes?
1592 if (g_conf->mds_inject_traceless_reply_probability &&
1593 mdr->ls && !mdr->o_trunc &&
1594 (rand() % 10000 < g_conf->mds_inject_traceless_reply_probability * 10000.0)) {
1595 dout(5) << "deliberately skipping trace for " << *reply << dendl;
1596 return;
1597 }
1598
1599 // inode, dentry, dir, ..., inode
1600 bufferlist bl;
1601 mds_rank_t whoami = mds->get_nodeid();
1602 client_t client = session->get_client();
1603 utime_t now = ceph_clock_now();
1604
1605 dout(20) << "set_trace_dist snapid " << snapid << dendl;
1606
1607 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
1608
1609 // realm
1610 if (snapid == CEPH_NOSNAP) {
1611 SnapRealm *realm;
1612 if (in)
1613 realm = in->find_snaprealm();
1614 else
1615 realm = dn->get_dir()->get_inode()->find_snaprealm();
1616 reply->snapbl = realm->get_snap_trace();
1617 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
1618 }
1619
1620 // dir + dentry?
1621 if (dn) {
1622 reply->head.is_dentry = 1;
1623 CDir *dir = dn->get_dir();
1624 CInode *diri = dir->get_inode();
1625
1626 diri->encode_inodestat(bl, session, NULL, snapid);
1627 dout(20) << "set_trace_dist added diri " << *diri << dendl;
1628
1629 #ifdef MDS_VERIFY_FRAGSTAT
1630 if (dir->is_complete())
1631 dir->verify_fragstat();
1632 #endif
1633 dir->encode_dirstat(bl, whoami);
1634 dout(20) << "set_trace_dist added dir " << *dir << dendl;
1635
1636 ::encode(dn->get_name(), bl);
1637 if (snapid == CEPH_NOSNAP)
1638 mds->locker->issue_client_lease(dn, client, bl, now, session);
1639 else
1640 encode_null_lease(bl);
1641 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
1642 } else
1643 reply->head.is_dentry = 0;
1644
1645 // inode
1646 if (in) {
1647 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
1648 dout(20) << "set_trace_dist added in " << *in << dendl;
1649 reply->head.is_target = 1;
1650 } else
1651 reply->head.is_target = 0;
1652
1653 reply->set_trace(bl);
1654 }
1655
1656
1657
1658
1659 /***
1660 * process a client request
1661 * This function DOES put the passed message before returning
1662 */
1663 void Server::handle_client_request(MClientRequest *req)
1664 {
1665 dout(4) << "handle_client_request " << *req << dendl;
1666
1667 if (mds->logger)
1668 mds->logger->inc(l_mds_request);
1669 if (logger)
1670 logger->inc(l_mdss_handle_client_request);
1671
1672 if (!mdcache->is_open()) {
1673 dout(5) << "waiting for root" << dendl;
1674 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
1675 return;
1676 }
1677
1678 // active session?
1679 Session *session = 0;
1680 if (req->get_source().is_client()) {
1681 session = mds->get_session(req);
1682 if (!session) {
1683 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
1684 } else if (session->is_closed() ||
1685 session->is_closing() ||
1686 session->is_killing()) {
1687 dout(5) << "session closed|closing|killing, dropping" << dendl;
1688 session = NULL;
1689 }
1690 if (!session) {
1691 if (req->is_queued_for_replay())
1692 mds->queue_one_replay();
1693 req->put();
1694 return;
1695 }
1696 }
1697
1698 // old mdsmap?
1699 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
1700 // should we send the client the newer map? hrm, this isn't ideal; they
1701 // may get a lot of copies if they have a high request rate.
1702 }
1703
1704 // completed request?
1705 bool has_completed = false;
1706 if (req->is_replay() || req->get_retry_attempt()) {
1707 assert(session);
1708 inodeno_t created;
1709 if (session->have_completed_request(req->get_reqid().tid, &created)) {
1710 has_completed = true;
1711 // Don't send a traceless reply if the completed request created a
1712 // new inode. Treat the request as a lookup request instead.
1713 if (req->is_replay() ||
1714 ((created == inodeno_t() || !mds->is_clientreplay()) &&
1715 req->get_op() != CEPH_MDS_OP_OPEN &&
1716 req->get_op() != CEPH_MDS_OP_CREATE)) {
1717 dout(5) << "already completed " << req->get_reqid() << dendl;
1718 MClientReply *reply = new MClientReply(req, 0);
1719 if (created != inodeno_t()) {
1720 bufferlist extra;
1721 ::encode(created, extra);
1722 reply->set_extra_bl(extra);
1723 }
1724 req->get_connection()->send_message(reply);
1725
1726 if (req->is_queued_for_replay())
1727 mds->queue_one_replay();
1728
1729 req->put();
1730 return;
1731 }
1732 if (req->get_op() != CEPH_MDS_OP_OPEN &&
1733 req->get_op() != CEPH_MDS_OP_CREATE) {
1734 dout(10) << " completed request which created new inode " << created
1735 << ", convert it to lookup request" << dendl;
1736 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
1737 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
1738 }
1739 }
1740 }
1741
1742 // trim completed_request list
1743 if (req->get_oldest_client_tid() > 0) {
1744 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
1745 assert(session);
1746 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
1747 // The session's 'completed_requests' was dirtied; mark it to be
1748 // potentially flushed at segment expiry.
1749 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
1750
1751 if (session->get_num_trim_requests_warnings() > 0 &&
1752 session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
1753 session->reset_num_trim_requests_warnings();
1754 } else {
1755 if (session->get_num_completed_requests() >=
1756 (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
1757 session->inc_num_trim_requests_warnings();
1758 stringstream ss;
1759 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
1760 << req->get_oldest_client_tid() << "), "
1761 << session->get_num_completed_requests()
1762 << " completed requests recorded in session\n";
1763 mds->clog->warn() << ss.str();
1764 dout(20) << __func__ << " " << ss.str() << dendl;
1765 }
1766 }
1767 }
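  // Worked example (assuming the default mds_max_completed_requests = 100000):
  // warnings fire when a session holds 100000, then 200000, then 400000, ...
  // completed requests, since the threshold is left-shifted by the number of
  // warnings already issued; the warning counter resets once the client trims
  // back below half the base limit.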
1768
1769 // register + dispatch
1770 MDRequestRef mdr = mdcache->request_start(req);
1771 if (!mdr.get())
1772 return;
1773
1774 if (session) {
1775 mdr->session = session;
1776 session->requests.push_back(&mdr->item_session_request);
1777 }
1778
1779 if (has_completed)
1780 mdr->has_completed = true;
1781
1782 // process embedded cap releases?
1783 // (only if NOT replay!)
1784 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
1785 client_t client = req->get_source().num();
1786 for (vector<MClientRequest::Release>::iterator p = req->releases.begin();
1787 p != req->releases.end();
1788 ++p)
1789 mds->locker->process_request_cap_release(mdr, client, p->item, p->dname);
1790 req->releases.clear();
1791 }
1792
1793 dispatch_client_request(mdr);
1794 return;
1795 }
1796
1797 void Server::handle_osd_map()
1798 {
1799 /* Note that we check the OSDMAP_FULL flag directly rather than
1800 * using osdmap_full_flag(), because we want to know "is the flag set"
1801 * rather than "does the flag apply to us?" */
1802 mds->objecter->with_osdmap([this](const OSDMap& o) {
1803 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
1804 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
1805 dout(7) << __func__ << ": full = " << is_full << " epoch = "
1806 << o.get_epoch() << dendl;
1807 });
1808 }
1809
1810 void Server::dispatch_client_request(MDRequestRef& mdr)
1811 {
1812 // we shouldn't be waiting on anyone.
1813 assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
1814
1815 if (mdr->killed) {
1816 dout(10) << "request " << *mdr << " was killed" << dendl;
1817 return;
1818 } else if (mdr->aborted) {
1819 mdr->aborted = false;
1820 mdcache->request_kill(mdr);
1821 return;
1822 }
1823
1824 MClientRequest *req = mdr->client_request;
1825
1826 if (logger) logger->inc(l_mdss_dispatch_client_request);
1827
1828 dout(7) << "dispatch_client_request " << *req << dendl;
1829
1830 if (req->may_write()) {
1831 if (mdcache->is_readonly()) {
1832 dout(10) << " read-only FS" << dendl;
1833 respond_to_request(mdr, -EROFS);
1834 return;
1835 }
1836 if (mdr->has_more() && mdr->more()->slave_error) {
1837 dout(10) << " got error from slaves" << dendl;
1838 respond_to_request(mdr, mdr->more()->slave_error);
1839 return;
1840 }
1841 }
1842
1843 if (is_full) {
1844 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
1845 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
1847 req->get_op() == CEPH_MDS_OP_RMXATTR ||
1848 req->get_op() == CEPH_MDS_OP_SETXATTR ||
1849 req->get_op() == CEPH_MDS_OP_CREATE ||
1850 req->get_op() == CEPH_MDS_OP_SYMLINK ||
1851 req->get_op() == CEPH_MDS_OP_MKSNAP ||
1852 ((req->get_op() == CEPH_MDS_OP_LINK ||
1853 req->get_op() == CEPH_MDS_OP_RENAME) &&
1854 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
1855 ) {
1856
1857 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
1858 respond_to_request(mdr, -ENOSPC);
1859 return;
1860 } else {
1861 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
1862 }
1863 }
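  // From a client's point of view (hedged illustration, not code in this file):
  //
  //   int fd = open("/mnt/cephfs/newfile", O_CREAT | O_WRONLY, 0644);
  //   // maps to CEPH_MDS_OP_CREATE; with the metadata pool flagged FULL this
  //   // fails with errno == ENOSPC, while unlink()/rmdir() stay permitted so
  //   // the client can still free space.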
1864
1865 switch (req->get_op()) {
1866 case CEPH_MDS_OP_LOOKUPHASH:
1867 case CEPH_MDS_OP_LOOKUPINO:
1868 handle_client_lookup_ino(mdr, false, false);
1869 break;
1870 case CEPH_MDS_OP_LOOKUPPARENT:
1871 handle_client_lookup_ino(mdr, true, false);
1872 break;
1873 case CEPH_MDS_OP_LOOKUPNAME:
1874 handle_client_lookup_ino(mdr, false, true);
1875 break;
1876
1877 // inodes ops.
1878 case CEPH_MDS_OP_LOOKUP:
1879 handle_client_getattr(mdr, true);
1880 break;
1881
1882 case CEPH_MDS_OP_LOOKUPSNAP:
1883 // lookupsnap does not reference a CDentry; treat it as a getattr
1884 case CEPH_MDS_OP_GETATTR:
1885 handle_client_getattr(mdr, false);
1886 break;
1887
1888 case CEPH_MDS_OP_SETATTR:
1889 handle_client_setattr(mdr);
1890 break;
1891 case CEPH_MDS_OP_SETLAYOUT:
1892 handle_client_setlayout(mdr);
1893 break;
1894 case CEPH_MDS_OP_SETDIRLAYOUT:
1895 handle_client_setdirlayout(mdr);
1896 break;
1897 case CEPH_MDS_OP_SETXATTR:
1898 handle_client_setxattr(mdr);
1899 break;
1900 case CEPH_MDS_OP_RMXATTR:
1901 handle_client_removexattr(mdr);
1902 break;
1903
1904 case CEPH_MDS_OP_READDIR:
1905 handle_client_readdir(mdr);
1906 break;
1907
1908 case CEPH_MDS_OP_SETFILELOCK:
1909 handle_client_file_setlock(mdr);
1910 break;
1911
1912 case CEPH_MDS_OP_GETFILELOCK:
1913 handle_client_file_readlock(mdr);
1914 break;
1915
1916 // funky.
1917 case CEPH_MDS_OP_CREATE:
1918 if (mdr->has_completed)
1919 handle_client_open(mdr); // already created.. just open
1920 else
1921 handle_client_openc(mdr);
1922 break;
1923
1924 case CEPH_MDS_OP_OPEN:
1925 handle_client_open(mdr);
1926 break;
1927
1928 // namespace.
1929 // no prior locks.
1930 case CEPH_MDS_OP_MKNOD:
1931 handle_client_mknod(mdr);
1932 break;
1933 case CEPH_MDS_OP_LINK:
1934 handle_client_link(mdr);
1935 break;
1936 case CEPH_MDS_OP_UNLINK:
1937 case CEPH_MDS_OP_RMDIR:
1938 handle_client_unlink(mdr);
1939 break;
1940 case CEPH_MDS_OP_RENAME:
1941 handle_client_rename(mdr);
1942 break;
1943 case CEPH_MDS_OP_MKDIR:
1944 handle_client_mkdir(mdr);
1945 break;
1946 case CEPH_MDS_OP_SYMLINK:
1947 handle_client_symlink(mdr);
1948 break;
1949
1950
1951 // snaps
1952 case CEPH_MDS_OP_LSSNAP:
1953 handle_client_lssnap(mdr);
1954 break;
1955 case CEPH_MDS_OP_MKSNAP:
1956 handle_client_mksnap(mdr);
1957 break;
1958 case CEPH_MDS_OP_RMSNAP:
1959 handle_client_rmsnap(mdr);
1960 break;
1961 case CEPH_MDS_OP_RENAMESNAP:
1962 handle_client_renamesnap(mdr);
1963 break;
1964
1965 default:
1966 dout(1) << " unknown client op " << req->get_op() << dendl;
1967 respond_to_request(mdr, -EOPNOTSUPP);
1968 }
1969 }
1970
1971
1972 // ---------------------------------------
1973 // SLAVE REQUESTS
1974
1975 /* This function DOES put the passed message before returning*/
1976 void Server::handle_slave_request(MMDSSlaveRequest *m)
1977 {
1978 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
1979 mds_rank_t from = mds_rank_t(m->get_source().num());
1980
1981 if (logger) logger->inc(l_mdss_handle_slave_request);
1982
1983 // reply?
1984 if (m->is_reply())
1985 return handle_slave_request_reply(m);
1986
1987 // the purpose of rename notify is to enforce causal message ordering: it makes
1988 // sure bystanders have received all messages from the rename srcdn's auth MDS.
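  // Rough message flow (hedged sketch):
  //
  //   srcdn auth MDS  --- OP_RENAMENOTIFY ---->  bystander (this MDS)
  //   bystander       --- OP_RENAMENOTIFYACK ->  srcdn auth MDS
  //
  // Messages on a Connection are delivered in order, so receiving the notify
  // here proves every earlier message from that MDS has arrived; the ack
  // below asserts exactly that and nothing more.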
1989 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
1990 MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
1991 MMDSSlaveRequest::OP_RENAMENOTIFYACK);
1992 mds->send_message(reply, m->get_connection());
1993 m->put();
1994 return;
1995 }
1996
1997 CDentry *straydn = NULL;
1998 if (m->stray.length() > 0) {
1999 straydn = mdcache->add_replica_stray(m->stray, from);
2000 assert(straydn);
2001 m->stray.clear();
2002 }
2003
2004 // am i a new slave?
2005 MDRequestRef mdr;
2006 if (mdcache->have_request(m->get_reqid())) {
2007 // existing?
2008 mdr = mdcache->request_get(m->get_reqid());
2009
2010 // is my request newer?
2011 if (mdr->attempt > m->get_attempt()) {
2012 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2013 << ", dropping " << *m << dendl;
2014 m->put();
2015 return;
2016 }
2017
2018
2019 if (mdr->attempt < m->get_attempt()) {
2020 // mine is old, close it out
2021 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2022 << ", closing out" << dendl;
2023 mdcache->request_finish(mdr);
2024 mdr.reset();
2025 } else if (mdr->slave_to_mds != from) {
2026 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
2027 m->put();
2028 return;
2029 }
2030
2031 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
2032 mdr->aborted = true;
2033 if (mdr->slave_request) {
2034 // only abort on-going xlock, wrlock and auth pin
2035 assert(!mdr->slave_did_prepare());
2036 } else {
2037 mdcache->request_finish(mdr);
2038 }
2039 m->put();
2040 return;
2041 }
2042 }
2043 if (!mdr.get()) {
2044 // new?
2045 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2046 dout(10) << "missing slave request for " << m->get_reqid()
2047 << " OP_FINISH, must have lost race with a forward" << dendl;
2048 m->put();
2049 return;
2050 }
2051 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2052 mdr->set_op_stamp(m->op_stamp);
2053 }
2054 assert(mdr->slave_request == 0); // only one at a time, please!
2055
2056 if (straydn) {
2057 mdr->pin(straydn);
2058 mdr->straydn = straydn;
2059 }
2060
2061 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2062 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2063 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2064 return;
2065 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2066 mdr->locks.empty()) {
2067 dout(3) << "not active yet, waiting" << dendl;
2068 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2069 return;
2070 }
2071
2072 mdr->reset_slave_request(m);
2073
2074 dispatch_slave_request(mdr);
2075 }
2076
2077 /* This function DOES put the passed message before returning*/
2078 void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
2079 {
2080 mds_rank_t from = mds_rank_t(m->get_source().num());
2081
2082 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2083 metareqid_t r = m->get_reqid();
2084 if (!mdcache->have_uncommitted_master(r, from)) {
2085 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2086 << from << " reqid " << r << dendl;
2087 m->put();
2088 return;
2089 }
2090 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2091 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2092 return;
2093 }
2094
2095 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2096 metareqid_t r = m->get_reqid();
2097 mdcache->committed_master_slave(r, from);
2098 m->put();
2099 return;
2100 }
2101
2102 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2103 if (m->get_attempt() != mdr->attempt) {
2104 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2105 << m->get_attempt() << dendl;
2106 m->put();
2107 return;
2108 }
2109
2110 switch (m->get_op()) {
2111 case MMDSSlaveRequest::OP_XLOCKACK:
2112 {
2113 // identify lock, master request
2114 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2115 m->get_object_info());
2116 mdr->more()->slaves.insert(from);
2117 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2118 mdr->xlocks.insert(lock);
2119 mdr->locks.insert(lock);
2120 mdr->finish_locking(lock);
2121 lock->get_xlock(mdr, mdr->get_client());
2122
2123 assert(mdr->more()->waiting_on_slave.count(from));
2124 mdr->more()->waiting_on_slave.erase(from);
2125 assert(mdr->more()->waiting_on_slave.empty());
2126 mdcache->dispatch_request(mdr);
2127 }
2128 break;
2129
2130 case MMDSSlaveRequest::OP_WRLOCKACK:
2131 {
2132 // identify lock, master request
2133 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2134 m->get_object_info());
2135 mdr->more()->slaves.insert(from);
2136 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2137 mdr->remote_wrlocks[lock] = from;
2138 mdr->locks.insert(lock);
2139 mdr->finish_locking(lock);
2140
2141 assert(mdr->more()->waiting_on_slave.count(from));
2142 mdr->more()->waiting_on_slave.erase(from);
2143 assert(mdr->more()->waiting_on_slave.empty());
2144 mdcache->dispatch_request(mdr);
2145 }
2146 break;
2147
2148 case MMDSSlaveRequest::OP_AUTHPINACK:
2149 handle_slave_auth_pin_ack(mdr, m);
2150 break;
2151
2152 case MMDSSlaveRequest::OP_LINKPREPACK:
2153 handle_slave_link_prep_ack(mdr, m);
2154 break;
2155
2156 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2157 handle_slave_rmdir_prep_ack(mdr, m);
2158 break;
2159
2160 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2161 handle_slave_rename_prep_ack(mdr, m);
2162 break;
2163
2164 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2165 handle_slave_rename_notify_ack(mdr, m);
2166 break;
2167
2168 default:
2169 ceph_abort();
2170 }
2171
2172 // done with reply.
2173 m->put();
2174 }
2175
2176 /* This function DOES put the mdr->slave_request before returning*/
2177 void Server::dispatch_slave_request(MDRequestRef& mdr)
2178 {
2179 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2180
2181 if (mdr->aborted) {
2182 dout(7) << " abort flag set, finishing" << dendl;
2183 mdcache->request_finish(mdr);
2184 return;
2185 }
2186
2187 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2188
2189 int op = mdr->slave_request->get_op();
2190 switch (op) {
2191 case MMDSSlaveRequest::OP_XLOCK:
2192 case MMDSSlaveRequest::OP_WRLOCK:
2193 {
2194 // identify object
2195 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2196 mdr->slave_request->get_object_info());
2197
2198 if (!lock) {
2199 dout(10) << "don't have object, dropping" << dendl;
2200 ceph_abort(); // this should not happen if we auth pinned properly.
2201 }
2202 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2203 dout(10) << "not auth for remote xlock attempt, dropping on "
2204 << *lock << " on " << *lock->get_parent() << dendl;
2205 } else {
2206 // use acquire_locks so that we get auth_pinning.
2207 set<SimpleLock*> rdlocks;
2208 set<SimpleLock*> wrlocks = mdr->wrlocks;
2209 set<SimpleLock*> xlocks = mdr->xlocks;
2210
2211 int replycode = 0;
2212 switch (op) {
2213 case MMDSSlaveRequest::OP_XLOCK:
2214 xlocks.insert(lock);
2215 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2216 break;
2217 case MMDSSlaveRequest::OP_WRLOCK:
2218 wrlocks.insert(lock);
2219 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2220 break;
2221 }
2222
2223 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
2224 return;
2225
2226 // ack
2227 MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, replycode);
2228 r->set_lock_type(lock->get_type());
2229 lock->get_parent()->set_object_info(r->get_object_info());
2230 mds->send_message(r, mdr->slave_request->get_connection());
2231 }
2232
2233 // done.
2234 mdr->reset_slave_request();
2235 }
2236 break;
2237
2238 case MMDSSlaveRequest::OP_UNXLOCK:
2239 case MMDSSlaveRequest::OP_UNWRLOCK:
2240 {
2241 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2242 mdr->slave_request->get_object_info());
2243 assert(lock);
2244 bool need_issue = false;
2245 switch (op) {
2246 case MMDSSlaveRequest::OP_UNXLOCK:
2247 mds->locker->xlock_finish(lock, mdr.get(), &need_issue);
2248 break;
2249 case MMDSSlaveRequest::OP_UNWRLOCK:
2250 mds->locker->wrlock_finish(lock, mdr.get(), &need_issue);
2251 break;
2252 }
2253 if (need_issue)
2254 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2255
2256 // done. no ack necessary.
2257 mdr->reset_slave_request();
2258 }
2259 break;
2260
2261 case MMDSSlaveRequest::OP_DROPLOCKS:
2262 mds->locker->drop_locks(mdr.get());
2263 mdr->reset_slave_request();
2264 break;
2265
2266 case MMDSSlaveRequest::OP_AUTHPIN:
2267 handle_slave_auth_pin(mdr);
2268 break;
2269
2270 case MMDSSlaveRequest::OP_LINKPREP:
2271 case MMDSSlaveRequest::OP_UNLINKPREP:
2272 handle_slave_link_prep(mdr);
2273 break;
2274
2275 case MMDSSlaveRequest::OP_RMDIRPREP:
2276 handle_slave_rmdir_prep(mdr);
2277 break;
2278
2279 case MMDSSlaveRequest::OP_RENAMEPREP:
2280 handle_slave_rename_prep(mdr);
2281 break;
2282
2283 case MMDSSlaveRequest::OP_FINISH:
2284 // information about rename imported caps
2285 if (mdr->slave_request->inode_export.length() > 0)
2286 mdr->more()->inode_import.claim(mdr->slave_request->inode_export);
2287 // finish off request.
2288 mdcache->request_finish(mdr);
2289 break;
2290
2291 default:
2292 ceph_abort();
2293 }
2294 }
2295
2296 /* This function DOES put the mdr->slave_request before returning*/
2297 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2298 {
2299 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2300
2301 // build list of objects
2302 list<MDSCacheObject*> objects;
2303 CInode *auth_pin_freeze = NULL;
2304 bool fail = false, wouldblock = false, readonly = false;
2305
2306 if (mdcache->is_readonly()) {
2307 dout(10) << " read-only FS" << dendl;
2308 readonly = true;
2309 fail = true;
2310 }
2311
2312 if (!fail) {
2313 for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
2314 p != mdr->slave_request->get_authpins().end();
2315 ++p) {
2316 MDSCacheObject *object = mdcache->get_object(*p);
2317 if (!object) {
2318 dout(10) << " don't have " << *p << dendl;
2319 fail = true;
2320 break;
2321 }
2322
2323 objects.push_back(object);
2324 if (*p == mdr->slave_request->get_authpin_freeze())
2325 auth_pin_freeze = static_cast<CInode*>(object);
2326 }
2327 }
2328
2329 // can we auth pin them?
2330 if (!fail) {
2331 for (list<MDSCacheObject*>::iterator p = objects.begin();
2332 p != objects.end();
2333 ++p) {
2334 if (!(*p)->is_auth()) {
2335 dout(10) << " not auth for " << **p << dendl;
2336 fail = true;
2337 break;
2338 }
2339 if (mdr->is_auth_pinned(*p))
2340 continue;
2341 if (!mdr->can_auth_pin(*p)) {
2342 if (mdr->slave_request->is_nonblock()) {
2343 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2344 fail = true;
2345 wouldblock = true;
2346 break;
2347 }
2348 // wait
2349 dout(10) << " waiting for authpinnable on " << **p << dendl;
2350 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2351 mdr->drop_local_auth_pins();
2352
2353 mds->locker->notify_freeze_waiter(*p);
2354 return;
2355 }
2356 }
2357 }
2358
2359 // auth pin!
2360 if (fail) {
2361 mdr->drop_local_auth_pins(); // just in case
2362 } else {
2363 /* the frozen auth pin is on the wrong inode; undo it */
2364 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2365 mdr->more()->rename_inode != auth_pin_freeze)
2366 mdr->unfreeze_auth_pin(true);
2367
2368 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2369 * on the source inode to complete. This happens after all locks for the rename
2370 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2371 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2372 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2373 * is called. The solution is to freeze the inode and prevent other MDRequests
2374 * from getting new auth pins.
2375 */
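    /* Concretely, the interleaving being avoided looks roughly like (hedged):
     *
     *   this slave:     acquire rename locks ..... then freeze_inode(srci)
     *   other request:  auth_pin(srci) ........... then wait on a rename lock
     *
     * freeze_inode() waits for srci's auth pins to drain while the other
     * request waits on a lock we already hold, so neither side progresses.
     * Freezing first turns the later auth_pin attempt into a wait instead.
     */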
2376 if (auth_pin_freeze) {
2377 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2378 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2379 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2380 mds->mdlog->flush();
2381 return;
2382 }
2383 }
2384 for (list<MDSCacheObject*>::iterator p = objects.begin();
2385 p != objects.end();
2386 ++p) {
2387 dout(10) << "auth_pinning " << **p << dendl;
2388 mdr->auth_pin(*p);
2389 }
2390 }
2391
2392 // ack!
2393 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2394
2395 // return list of my auth_pins (if any)
2396 for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
2397 p != mdr->auth_pins.end();
2398 ++p) {
2399 MDSCacheObjectInfo info;
2400 (*p)->set_object_info(info);
2401 reply->get_authpins().push_back(info);
2402 if (*p == (MDSCacheObject*)auth_pin_freeze)
2403 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2404 }
2405
2406 if (wouldblock)
2407 reply->mark_error_wouldblock();
2408 if (readonly)
2409 reply->mark_error_rofs();
2410
2411 mds->send_message_mds(reply, mdr->slave_to_mds);
2412
2413 // clean up this request
2414 mdr->reset_slave_request();
2415 return;
2416 }
2417
2418 /* This function DOES NOT put the passed ack before returning*/
2419 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
2420 {
2421 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2422 mds_rank_t from = mds_rank_t(ack->get_source().num());
2423
2424 // added auth pins?
2425 set<MDSCacheObject*> pinned;
2426 for (vector<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
2427 p != ack->get_authpins().end();
2428 ++p) {
2429 MDSCacheObject *object = mdcache->get_object(*p);
2430 assert(object); // we pinned it
2431 dout(10) << " remote has pinned " << *object << dendl;
2432 if (!mdr->is_auth_pinned(object))
2433 mdr->remote_auth_pins[object] = from;
2434 if (*p == ack->get_authpin_freeze())
2435 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2436 pinned.insert(object);
2437 }
2438
2439 // removed frozen auth pin?
2440 if (mdr->more()->is_remote_frozen_authpin &&
2441 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2442 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2443 assert(p != mdr->remote_auth_pins.end());
2444 if (p->second == from) {
2445 mdr->more()->is_remote_frozen_authpin = false;
2446 }
2447 }
2448
2449 // removed auth pins?
2450 map<MDSCacheObject*, mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
2451 while (p != mdr->remote_auth_pins.end()) {
2452 MDSCacheObject* object = p->first;
2453 if (p->second == from && pinned.count(object) == 0) {
2454 dout(10) << " remote has unpinned " << *object << dendl;
2455 mdr->remote_auth_pins.erase(p++);
2456 } else {
2457 ++p;
2458 }
2459 }
2460
2461 if (ack->is_error_rofs()) {
2462 mdr->more()->slave_error = -EROFS;
2463 mdr->aborted = true;
2464 } else if (ack->is_error_wouldblock()) {
2465 mdr->more()->slave_error = -EWOULDBLOCK;
2466 mdr->aborted = true;
2467 }
2468
2469 // note slave
2470 mdr->more()->slaves.insert(from);
2471
2472 // clear from waiting list
2473 assert(mdr->more()->waiting_on_slave.count(from));
2474 mdr->more()->waiting_on_slave.erase(from);
2475
2476 // go again?
2477 if (mdr->more()->waiting_on_slave.empty())
2478 mdcache->dispatch_request(mdr);
2479 else
2480 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2481 }
2482
2483
2484 // ---------------------------------------
2485 // HELPERS
2486
2487
2488 /**
2489 * check whether we are permitted to complete a request
2490 *
2491 * Check whether we have permission to perform the operation specified
2492 * by mask on the given inode, based on the capability in the mdr's
2493 * session.
2494 */
2495 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2496 {
2497 if (mdr->session) {
2498 int r = mdr->session->check_access(
2499 in, mask,
2500 mdr->client_request->get_caller_uid(),
2501 mdr->client_request->get_caller_gid(),
2502 &mdr->client_request->get_caller_gid_list(),
2503 mdr->client_request->head.args.setattr.uid,
2504 mdr->client_request->head.args.setattr.gid);
2505 if (r < 0) {
2506 respond_to_request(mdr, r);
2507 return false;
2508 }
2509 }
2510 return true;
2511 }
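/*
 * Typical call pattern (as used later in this file): the caller bails out on
 * false because check_access() has already sent the error reply.
 *
 *   if (!check_access(mdr, cur, MAY_READ))
 *     return;
 */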
2512
2513 /**
2514 * check whether fragment has reached maximum size
2515 *
2516 */
2517 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2518 {
2519 const auto size = in->get_frag_size();
2520 if (size >= g_conf->mds_bal_fragment_size_max) {
2521 dout(10) << "fragment " << *in << " size exceeds " << g_conf->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2522 respond_to_request(mdr, -ENOSPC);
2523 return false;
2524 }
2525
2526 return true;
2527 }
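/*
 * Worked example (assuming the default mds_bal_fragment_size_max = 100000):
 * a create into a dirfrag that already holds 100000 dentries is refused with
 * ENOSPC until the fragment is split or entries are removed.
 */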
2528
2529
2530 /** validate_dentry_dir
2531 *
2532 * verify that the dir exists and would own the dname.
2533 * do not check if the dentry exists.
2534 */
2535 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, boost::string_view dname)
2536 {
2537 // make sure parent is a dir?
2538 if (!diri->is_dir()) {
2539 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2540 respond_to_request(mdr, -ENOTDIR);
2541 return NULL;
2542 }
2543
2544 // which dirfrag?
2545 frag_t fg = diri->pick_dirfrag(dname);
2546 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2547 if (!dir)
2548 return 0;
2549
2550 // frozen?
2551 if (dir->is_frozen()) {
2552 dout(7) << "dir is frozen " << *dir << dendl;
2553 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2554 return NULL;
2555 }
2556
2557 return dir;
2558 }
2559
2560
2561 /** prepare_null_dentry
2562 * prepare a null (or existing) dentry in given dir.
2563 * wait for any dn lock.
2564 */
2565 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, boost::string_view dname, bool okexist)
2566 {
2567 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2568 assert(dir->is_auth());
2569
2570 client_t client = mdr->get_client();
2571
2572 // does it already exist?
2573 CDentry *dn = dir->lookup(dname);
2574 if (dn) {
2575 /*
2576 if (dn->lock.is_xlocked_by_other(mdr)) {
2577 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2578 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2579 return 0;
2580 }
2581 */
2582 if (!dn->get_linkage(client, mdr)->is_null()) {
2583 // name already exists
2584 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
2585 if (!okexist) {
2586 respond_to_request(mdr, -EEXIST);
2587 return 0;
2588 }
2589 } else {
2590 dn->first = dir->inode->find_snaprealm()->get_newest_seq() + 1;
2591 }
2592
2593 return dn;
2594 }
2595
2596 // make sure dir is complete
2597 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2598 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2599 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2600 return 0;
2601 }
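  // The bloom filter gives no false negatives: if has_bloom() and the name is
  // not in the bloom, the dentry definitely doesn't exist in this dirfrag, so
  // we may create without fetching; a false positive merely costs the fetch
  // above. Hedged sketch of the skip condition:
  //
  //   if (dir->has_bloom() && !dir->is_in_bloom(dname))
  //     /* name cannot exist; safe to add a null dentry immediately */;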
2602
2603 // create
2604 dn = dir->add_null_dentry(dname, dir->inode->find_snaprealm()->get_newest_seq() + 1);
2605 dn->mark_new();
2606 dout(10) << "prepare_null_dentry added " << *dn << dendl;
2607 return dn;
2608 }
2609
2610 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
2611 {
2612 CDentry *straydn = mdr->straydn;
2613 if (straydn) {
2614 string straydname;
2615 in->name_stray_dentry(straydname);
2616 if (straydn->get_name() == straydname)
2617 return straydn;
2618
2619 assert(!mdr->done_locking);
2620 mdr->unpin(straydn);
2621 }
2622
2623 CDir *straydir = mdcache->get_stray_dir(in);
2624
2625 if (!mdr->client_request->is_replay() &&
2626 !check_fragment_space(mdr, straydir))
2627 return NULL;
2628
2629 straydn = mdcache->get_or_create_stray_dentry(in);
2630 mdr->straydn = straydn;
2631 mdr->pin(straydn);
2632 return straydn;
2633 }
2634
2635 /** prepare_new_inode
2636 *
2637 * create a new inode. set c/m/atime. hit dir pop.
2638 */
2639 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
2640 file_layout_t *layout)
2641 {
2642 CInode *in = new CInode(mdcache);
2643
2644 // Server::prepare_force_open_sessions() can re-open a session in the closing
2645 // state. In that corner case, the session's prealloc_inos are being freed.
2646 // To simplify the code, we disallow using/refilling the session's prealloc_inos
2647 // while the session is opening.
2648 bool allow_prealloc_inos = !mdr->session->is_opening();
2649
2650 // assign ino
2651 if (allow_prealloc_inos &&
2652 mdr->session->info.prealloc_inos.size()) {
2653 mdr->used_prealloc_ino =
2654 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
2655 mds->sessionmap.mark_projected(mdr->session);
2656
2657 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
2658 << " (" << mdr->session->info.prealloc_inos
2659 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
2660 << dendl;
2661 } else {
2662 mdr->alloc_ino =
2663 in->inode.ino = mds->inotable->project_alloc_id();
2664 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
2665 }
2666
2667 if (useino && useino != in->inode.ino) {
2668 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
2669 mds->clog->error() << mdr->client_request->get_source()
2670 << " specified ino " << useino
2671 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
2672 //ceph_abort(); // just for now.
2673 }
2674
2675 if (allow_prealloc_inos &&
2676 mdr->session->get_num_projected_prealloc_inos() < g_conf->mds_client_prealloc_inos / 2) {
2677 int need = g_conf->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
2678 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
2679 assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
2680 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
2681 mds->sessionmap.mark_projected(mdr->session);
2682 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
2683 }
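  // Worked example (assuming the default mds_client_prealloc_inos = 1000):
  // once a session's projected pool of preallocated inos drops below 500, we
  // project (1000 - remaining) fresh ids from the inotable, so steady-state
  // creates rarely touch the table synchronously.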
2684
2685 in->inode.version = 1;
2686 in->inode.xattr_version = 1;
2687 in->inode.nlink = 1; // FIXME
2688
2689 in->inode.mode = mode;
2690
2691 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
2692 if (in->inode.is_dir()) {
2693 in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
2694 } else if (layout) {
2695 in->inode.layout = *layout;
2696 } else {
2697 in->inode.layout = mdcache->default_file_layout;
2698 }
2699
2700 in->inode.truncate_size = -1ull; // not truncated, yet!
2701 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
2702
2703 CInode *diri = dir->get_inode();
2704
2705 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
2706
2707 if (diri->inode.mode & S_ISGID) {
2708 dout(10) << " dir is setgid" << dendl;
2709 in->inode.gid = diri->inode.gid;
2710 if (S_ISDIR(mode)) {
2711 dout(10) << " new dir also setgid" << dendl;
2712 in->inode.mode |= S_ISGID;
2713 }
2714 } else
2715 in->inode.gid = mdr->client_request->get_caller_gid();
2716
2717 in->inode.uid = mdr->client_request->get_caller_uid();
2718
2719 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
2720 mdr->get_op_stamp();
2721
2722 in->inode.change_attr = 0;
2723
2724 MClientRequest *req = mdr->client_request;
2725 if (req->get_data().length()) {
2726 bufferlist::iterator p = req->get_data().begin();
2727
2728 // xattrs on new inode?
2729 CInode::mempool_xattr_map xattrs;
2730 ::decode(xattrs, p);
2731 for (const auto &p : xattrs) {
2732 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
2733 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
2734 if (!em.second)
2735 em.first->second = p.second;
2736 }
2737 }
2738
2739 if (!mds->mdsmap->get_inline_data_enabled() ||
2740 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
2741 in->inode.inline_data.version = CEPH_INLINE_NONE;
2742
2743 mdcache->add_inode(in); // add
2744 dout(10) << "prepare_new_inode " << *in << dendl;
2745 return in;
2746 }
2747
2748 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
2749 {
2750 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
2751 << " inotablev " << mds->inotable->get_projected_version()
2752 << dendl;
2753 blob->set_ino_alloc(mdr->alloc_ino,
2754 mdr->used_prealloc_ino,
2755 mdr->prealloc_inos,
2756 mdr->client_request->get_source(),
2757 mds->sessionmap.get_projected(),
2758 mds->inotable->get_projected_version());
2759 }
2760
2761 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
2762 {
2763 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
2764 << " / " << mdr->prealloc_inos
2765 << " / " << mdr->used_prealloc_ino << dendl;
2766
2767 if (mdr->alloc_ino) {
2768 mds->inotable->apply_alloc_id(mdr->alloc_ino);
2769 }
2770 if (mdr->prealloc_inos.size()) {
2771 assert(session);
2772 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
2773 session->info.prealloc_inos.insert(mdr->prealloc_inos);
2774 mds->sessionmap.mark_dirty(session);
2775 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
2776 }
2777 if (mdr->used_prealloc_ino) {
2778 assert(session);
2779 session->info.used_inos.erase(mdr->used_prealloc_ino);
2780 mds->sessionmap.mark_dirty(session);
2781 }
2782 }
2783
2784 class C_MDS_TryFindInode : public ServerContext {
2785 MDRequestRef mdr;
2786 public:
2787 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
2788 void finish(int r) override {
2789 if (r == -ESTALE) // :( find_ino_peers failed
2790 server->respond_to_request(mdr, r);
2791 else
2792 server->dispatch_client_request(mdr);
2793 }
2794 };
2795
2796 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
2797 {
2798 // figure parent dir vs dname
2799 if (refpath.depth() == 0) {
2800 dout(7) << "can't do that to root" << dendl;
2801 respond_to_request(mdr, -EINVAL);
2802 return 0;
2803 }
2804 string dname = refpath.last_dentry();
2805 refpath.pop_dentry();
2806
2807 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
2808
2809 // traverse to parent dir
2810 CInode *diri;
2811 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
2812 if (r > 0) return 0; // delayed
2813 if (r < 0) {
2814 if (r == -ESTALE) {
2815 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2816 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
2817 return 0;
2818 }
2819 respond_to_request(mdr, r);
2820 return 0;
2821 }
2822
2823 // is it an auth dir?
2824 CDir *dir = validate_dentry_dir(mdr, diri, dname);
2825 if (!dir)
2826 return 0; // forwarded or waiting for freeze
2827
2828 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
2829 return dir;
2830 }
2831
2832 /* If this returns null, the request has been handled
2833 * as appropriate: forwarded on, or the client's been replied to */
2834 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
2835 set<SimpleLock*> &rdlocks,
2836 bool want_auth,
2837 bool no_want_auth, /* for readdir, which doesn't want auth _even_if_ it's
2838 a snapped dir */
2839 file_layout_t **layout,
2840 bool no_lookup) // true if we cannot return a null dentry lease
2841 {
2842 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2843 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
2844
2845 if (mdr->done_locking)
2846 return mdr->in[n];
2847
2848 // traverse
2849 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
2850 if (r > 0)
2851 return NULL; // delayed
2852 if (r < 0) { // error
2853 if (r == -ENOENT && n == 0 && !mdr->dn[n].empty()) {
2854 if (!no_lookup) {
2855 mdr->tracedn = mdr->dn[n].back();
2856 }
2857 respond_to_request(mdr, r);
2858 } else if (r == -ESTALE) {
2859 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
2860 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
2861 mdcache->find_ino_peers(refpath.get_ino(), c);
2862 } else {
2863 dout(10) << "FAIL on error " << r << dendl;
2864 respond_to_request(mdr, r);
2865 }
2866 return 0;
2867 }
2868 CInode *ref = mdr->in[n];
2869 dout(10) << "ref is " << *ref << dendl;
2870
2871 // fw to inode auth?
2872 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
2873 want_auth = true;
2874
2875 if (want_auth) {
2876 if (ref->is_ambiguous_auth()) {
2877 dout(10) << "waiting for single auth on " << *ref << dendl;
2878 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
2879 return 0;
2880 }
2881 if (!ref->is_auth()) {
2882 dout(10) << "fw to auth for " << *ref << dendl;
2883 mdcache->request_forward(mdr, ref->authority().first);
2884 return 0;
2885 }
2886
2887 // auth_pin?
2888 // do NOT proceed if freezing, as cap release may defer in that case, and
2889 // we could deadlock when we try to lock @ref.
2890 // if we're already auth_pinned, continue; the release has already been processed.
2891 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
2892 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
2893 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
2894 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2895 /* If we have any auth pins, this will deadlock.
2896 * But the only way to get here if we've already got auth pins
2897 * is because we're on an inode with snapshots that got updated
2898 * between dispatches of this request. So we're going to drop
2899 * our locks and our auth pins and reacquire them later.
2900 *
2901 * This is safe since we're only in this function when working on
2902 * a single MDS request; otherwise we'd be in
2903 * rdlock_path_xlock_dentry.
2904 */
2905 mds->locker->drop_locks(mdr.get(), NULL);
2906 mdr->drop_local_auth_pins();
2907 if (!mdr->remote_auth_pins.empty())
2908 mds->locker->notify_freeze_waiter(ref);
2909 return 0;
2910 }
2911
2912 mdr->auth_pin(ref);
2913 }
2914
2915 for (int i=0; i<(int)mdr->dn[n].size(); i++)
2916 rdlocks.insert(&mdr->dn[n][i]->lock);
2917 if (layout)
2918 mds->locker->include_snap_rdlocks_wlayout(rdlocks, ref, layout);
2919 else
2920 mds->locker->include_snap_rdlocks(rdlocks, ref);
2921
2922 // set and pin ref
2923 mdr->pin(ref);
2924 return ref;
2925 }
2926
2927
2928 /** rdlock_path_xlock_dentry
2929 * traverse path to the directory that could/would contain dentry.
2930 * make sure i am auth for that dentry, forward as necessary.
2931 * create null dentry in place (or use existing if okexist).
2932 * get rdlocks on traversed dentries, xlock on new dentry.
2933 */
2934 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
2935 set<SimpleLock*>& rdlocks, set<SimpleLock*>& wrlocks, set<SimpleLock*>& xlocks,
2936 bool okexist, bool mustexist, bool alwaysxlock,
2937 file_layout_t **layout)
2938 {
2939 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
2940
2941 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
2942
2943 client_t client = mdr->get_client();
2944
2945 if (mdr->done_locking)
2946 return mdr->dn[n].back();
2947
2948 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
2949 if (!dir) return 0;
2950
2951 CInode *diri = dir->get_inode();
2952 if (!mdr->reqid.name.is_mds()) {
2953 if (diri->is_system() && !diri->is_root()) {
2954 respond_to_request(mdr, -EROFS);
2955 return 0;
2956 }
2957 }
2958 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
2959 respond_to_request(mdr, -ENOENT);
2960 return 0;
2961 }
2962
2963 // make a null dentry?
2964 boost::string_view dname = refpath.last_dentry();
2965 CDentry *dn;
2966 if (mustexist) {
2967 dn = dir->lookup(dname);
2968
2969 // make sure dir is complete
2970 if (!dn && !dir->is_complete() &&
2971 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
2972 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
2973 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
2974 return 0;
2975 }
2976
2977 // readable?
2978 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
2979 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2980 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2981 return 0;
2982 }
2983
2984 // exists?
2985 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
2986 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
2987 respond_to_request(mdr, -ENOENT);
2988 return 0;
2989 }
2990 } else {
2991 dn = prepare_null_dentry(mdr, dir, dname, okexist);
2992 if (!dn)
2993 return 0;
2994 }
2995
2996 mdr->dn[n].push_back(dn);
2997 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
2998 mdr->in[n] = dnl->get_inode();
2999
3000 // -- lock --
3001 // NOTE: rename takes the same set of locks for srcdn
3002 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3003 rdlocks.insert(&mdr->dn[n][i]->lock);
3004 if (alwaysxlock || dnl->is_null())
3005 xlocks.insert(&dn->lock); // new dn, xlock
3006 else
3007 rdlocks.insert(&dn->lock); // existing dn, rdlock
3008 wrlocks.insert(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
3009 wrlocks.insert(&dn->get_dir()->inode->nestlock); // and wrlock on dir nested (recursive) stats
3010 if (layout)
3011 mds->locker->include_snap_rdlocks_wlayout(rdlocks, dn->get_dir()->inode, layout);
3012 else
3013 mds->locker->include_snap_rdlocks(rdlocks, dn->get_dir()->inode);
3014
3015 return dn;
3016 }
3017
3018
3019
3020
3021
3022 /**
3023 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3024 *
3025 * @param diri base inode
3026 * @param fg the exact frag we want
3027 * @param mdr request
3028 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3029 */
3030 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3031 {
3032 CDir *dir = diri->get_dirfrag(fg);
3033
3034 // not open and inode not mine?
3035 if (!dir && !diri->is_auth()) {
3036 mds_rank_t inauth = diri->authority().first;
3037 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3038 mdcache->request_forward(mdr, inauth);
3039 return 0;
3040 }
3041
3042 // not open and inode frozen?
3043 if (!dir && diri->is_frozen()) {
3044 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3045 assert(diri->get_parent_dir());
3046 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3047 return 0;
3048 }
3049
3050 // invent?
3051 if (!dir)
3052 dir = diri->get_or_open_dirfrag(mdcache, fg);
3053
3054 // am i auth for the dirfrag?
3055 if (!dir->is_auth()) {
3056 mds_rank_t auth = dir->authority().first;
3057 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3058 << ", fw to mds." << auth << dendl;
3059 mdcache->request_forward(mdr, auth);
3060 return 0;
3061 }
3062
3063 return dir;
3064 }
3065
3066
3067 // ===============================================================================
3068 // STAT
3069
3070 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3071 {
3072 MClientRequest *req = mdr->client_request;
3073 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3074
3075 if (req->get_filepath().depth() == 0 && is_lookup) {
3076 // refpath can't be empty for lookup but it can for
3077 // getattr (we do getattr with empty refpath for mount of '/')
3078 respond_to_request(mdr, -EINVAL);
3079 return;
3080 }
3081
3082 bool want_auth = false;
3083 int mask = req->head.args.getattr.mask;
3084 if (mask & CEPH_STAT_RSTAT)
3085 want_auth = true; // recursive stats are only accurate on the auth MDS
3086
3087 CInode *ref = rdlock_path_pin_ref(mdr, 0, rdlocks, want_auth, false, NULL,
3088 !is_lookup);
3089 if (!ref) return;
3090
3091 /*
3092 * if client currently holds the EXCL cap on a field, do not rdlock
3093 * it; client's stat() will result in valid info if _either_ EXCL
3094 * cap is held or MDS rdlocks and reads the value here.
3095 *
3096 * handling this case here is easier than weakening rdlock
3097 * semantics... that would cause problems elsewhere.
3098 */
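  /*
   * Example (hedged): a client holding CEPH_CAP_AUTH_EXCL may have chmod()ed
   * the file locally and not yet flushed. Rdlocking authlock here would force
   * a revoke/flush of that client's own EXCL cap and stall this stat, while
   * its caps already make its stat() view valid; hence we skip the rdlock
   * when the EXCL cap is issued.
   */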
3099 client_t client = mdr->get_client();
3100 int issued = 0;
3101 Capability *cap = ref->get_client_cap(client);
3102 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3103 mdr->snapid <= cap->client_follows))
3104 issued = cap->issued();
3105
3106 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3107 rdlocks.insert(&ref->linklock);
3108 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3109 rdlocks.insert(&ref->authlock);
3110 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3111 rdlocks.insert(&ref->xattrlock);
3112 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3113 // Don't wait on unstable filelock if client is allowed to read file size.
3114 // This can reduce the response time of getattr in the case that multiple
3115 // clients do stat(2) and there are writers.
3116 // The downside of this optimization is that mds may not issue Fs caps along
3117 // with getattr reply. Client may need to send more getattr requests.
3118 if (mdr->rdlocks.count(&ref->filelock)) {
3119 rdlocks.insert(&ref->filelock);
3120 } else if (ref->filelock.is_stable() ||
3121 ref->filelock.get_num_wrlocks() > 0 ||
3122 !ref->filelock.can_read(mdr->get_client())) {
3123 rdlocks.insert(&ref->filelock);
3124 mdr->done_locking = false;
3125 }
3126 }
3127
3128 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3129 return;
3130
3131 if (!check_access(mdr, ref, MAY_READ))
3132 return;
3133
3134 utime_t now = ceph_clock_now();
3135 mdr->set_mds_stamp(now);
3136
3137 // note which caps are requested, so we return at least a snapshot
3138 // value for them. (currently this matters for xattrs and inline data)
3139 mdr->getattr_caps = mask;
3140
3141 mds->balancer->hit_inode(now, ref, META_POP_IRD,
3142 req->get_source().num());
3143
3144 // reply
3145 dout(10) << "reply to stat on " << *req << dendl;
3146 mdr->tracei = ref;
3147 if (is_lookup)
3148 mdr->tracedn = mdr->dn[0].back();
3149 respond_to_request(mdr, 0);
3150 }
3151
3152 struct C_MDS_LookupIno2 : public ServerContext {
3153 MDRequestRef mdr;
3154 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3155 void finish(int r) override {
3156 server->_lookup_ino_2(mdr, r);
3157 }
3158 };
3159
3160 /* This function DOES clean up the mdr before returning*/
3161 /*
3162 * filepath: ino
3163 */
3164 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3165 bool want_parent, bool want_dentry)
3166 {
3167 MClientRequest *req = mdr->client_request;
3168
3169 inodeno_t ino = req->get_filepath().get_ino();
3170 CInode *in = mdcache->get_inode(ino);
3171 if (in && in->state_test(CInode::STATE_PURGING)) {
3172 respond_to_request(mdr, -ESTALE);
3173 return;
3174 }
3175 if (!in) {
3176 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3177 return;
3178 }
3179
3180 if (mdr && in->snaprealm && !in->snaprealm->is_open() &&
3181 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3182 return;
3183 }
3184
3185 // check for nothing (not read or write); this still applies the
3186 // path check.
3187 if (!check_access(mdr, in, 0))
3188 return;
3189
3190 CDentry *dn = in->get_projected_parent_dn();
3191 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3192
3193 set<SimpleLock*> rdlocks;
3194 if (dn && (want_parent || want_dentry)) {
3195 mdr->pin(dn);
3196 rdlocks.insert(&dn->lock);
3197 }
3198
3199 unsigned mask = req->head.args.getattr.mask;
3200 if (mask) {
3201 Capability *cap = in->get_client_cap(mdr->get_client());
3202 int issued = 0;
3203 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3204 issued = cap->issued();
3205 // permission bits, ACL/security xattrs
3206 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3207 rdlocks.insert(&in->authlock);
3208 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3209 rdlocks.insert(&in->xattrlock);
3210
3211 mdr->getattr_caps = mask;
3212 }
3213
3214 if (!rdlocks.empty()) {
3215 set<SimpleLock*> wrlocks, xlocks;
3216 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3217 return;
3218
3219 if (diri != NULL) {
3220 // need read access to directory inode
3221 if (!check_access(mdr, diri, MAY_READ))
3222 return;
3223 }
3224 }
3225
3226 if (want_parent) {
3227 if (in->is_base()) {
3228 respond_to_request(mdr, -EINVAL);
3229 return;
3230 }
3231 if (!diri || diri->is_stray()) {
3232 respond_to_request(mdr, -ESTALE);
3233 return;
3234 }
3235 dout(10) << "reply to lookup_parent " << *in << dendl;
3236 mdr->tracei = diri;
3237 respond_to_request(mdr, 0);
3238 } else {
3239 if (want_dentry) {
3240 inodeno_t dirino = req->get_filepath2().get_ino();
3241 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3242 respond_to_request(mdr, -ENOENT);
3243 return;
3244 }
3245 dout(10) << "reply to lookup_name " << *in << dendl;
3246 } else
3247 dout(10) << "reply to lookup_ino " << *in << dendl;
3248
3249 mdr->tracei = in;
3250 if (want_dentry)
3251 mdr->tracedn = dn;
3252 respond_to_request(mdr, 0);
3253 }
3254 }
3255
3256 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3257 {
3258 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3259 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3260
3261 // `r` is a rank if >=0, else an error code
3262 if (r >= 0) {
3263 mds_rank_t dest_rank(r);
3264 if (dest_rank == mds->get_nodeid())
3265 dispatch_client_request(mdr);
3266 else
3267 mdcache->request_forward(mdr, dest_rank);
3268 return;
3269 }
3270
3271 // give up
3272 if (r == -ENOENT || r == -ENODATA)
3273 r = -ESTALE;
3274 respond_to_request(mdr, r);
3275 }
3276
3277
3278 /* This function takes responsibility for the passed mdr*/
3279 void Server::handle_client_open(MDRequestRef& mdr)
3280 {
3281 MClientRequest *req = mdr->client_request;
3282 dout(7) << "open on " << req->get_filepath() << dendl;
3283
3284 int flags = req->head.args.open.flags;
3285 int cmode = ceph_flags_to_mode(flags);
3286 if (cmode < 0) {
3287 respond_to_request(mdr, -EINVAL);
3288 return;
3289 }
3290
3291 bool need_auth = !file_mode_is_readonly(cmode) ||
3292 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3293
3294 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3295 dout(7) << "read-only FS" << dendl;
3296 respond_to_request(mdr, -EROFS);
3297 return;
3298 }
3299
3300 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3301 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, need_auth);
3302 if (!cur)
3303 return;
3304
3305 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3306 assert(!need_auth);
3307 mdr->done_locking = false;
3308 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3309 if (!cur)
3310 return;
3311 }
3312
3313 if (!cur->inode.is_file()) {
3314 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3315 cmode = CEPH_FILE_MODE_PIN;
3316 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
3317 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3318 flags &= ~CEPH_O_TRUNC;
3319 }
3320
3321 dout(10) << "open flags = " << flags
3322 << ", filemode = " << cmode
3323 << ", need_auth = " << need_auth
3324 << dendl;
3325
3326 // regular file?
3327 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3328 dout(7) << "not a file or dir " << *cur << dendl;
3329 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3330 return;
3331 }*/
3332 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3333 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3334 respond_to_request(mdr, -EINVAL);
3335 return;
3336 }
3337
3338 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3339 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3340 // return -EISDIR for a directory, -EINVAL for any other non-regular inode
3341 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3342 return;
3343 }
3344
3345 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3346 !mdr->session->connection->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3347 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3348 respond_to_request(mdr, -EPERM);
3349 return;
3350 }
3351
3352 // snapped data is read only
3353 if (mdr->snapid != CEPH_NOSNAP &&
3354 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3355 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3356 respond_to_request(mdr, -EROFS);
3357 return;
3358 }
3359
3360 unsigned mask = req->head.args.open.mask;
3361 if (mask) {
3362 Capability *cap = cur->get_client_cap(mdr->get_client());
3363 int issued = 0;
3364 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3365 issued = cap->issued();
3366 // permission bits, ACL/security xattrs
3367 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3368 rdlocks.insert(&cur->authlock);
3369 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3370 rdlocks.insert(&cur->xattrlock);
3371
3372 mdr->getattr_caps = mask;
3373 }
3374
3375 // O_TRUNC
3376 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3377 assert(cur->is_auth());
3378
3379 xlocks.insert(&cur->filelock);
3380 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3381 return;
3382
3383 if (!check_access(mdr, cur, MAY_WRITE))
3384 return;
3385
3386 // wait for pending truncate?
3387 const auto pi = cur->get_projected_inode();
3388 if (pi->is_truncating()) {
3389 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3390 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3391 mds->locker->drop_locks(mdr.get());
3392 mdr->drop_local_auth_pins();
3393 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3394 return;
3395 }
3396
3397 do_open_truncate(mdr, cmode);
3398 return;
3399 }
3400
3401 // sync filelock if snapped.
3402 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3403 // and that data itself is flushed so that we can read the snapped data off disk.
3404 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3405 rdlocks.insert(&cur->filelock);
3406 }
3407
3408 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3409 return;
3410
3411 mask = MAY_READ;
3412 if (cmode & CEPH_FILE_MODE_WR)
3413 mask |= MAY_WRITE;
3414 if (!check_access(mdr, cur, mask))
3415 return;
3416
3417 utime_t now = ceph_clock_now();
3418 mdr->set_mds_stamp(now);
3419
3420 if (cur->is_file() || cur->is_dir()) {
3421 if (mdr->snapid == CEPH_NOSNAP) {
3422 // register new cap
3423 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3424 if (cap)
3425 dout(12) << "open issued caps " << ccap_string(cap->pending())
3426 << " for " << req->get_source()
3427 << " on " << *cur << dendl;
3428 } else {
3429 int caps = ceph_caps_for_mode(cmode);
3430 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3431 << " for " << req->get_source()
3432 << " snapid " << mdr->snapid
3433 << " on " << *cur << dendl;
3434 mdr->snap_caps = caps;
3435 }
3436 }
3437
3438 // increase max_size?
3439 if (cmode & CEPH_FILE_MODE_WR)
3440 mds->locker->check_inode_max_size(cur);
3441
3442 // make sure this inode gets into the journal
3443 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3444 !cur->item_open_file.is_on_list()) {
3445 LogSegment *ls = mds->mdlog->get_current_segment();
3446 EOpen *le = new EOpen(mds->mdlog);
3447 mdlog->start_entry(le);
3448 le->add_clean_inode(cur);
3449 ls->open_files.push_back(&cur->item_open_file);
3450 mdlog->submit_entry(le);
3451 }
3452
3453 // hit pop
3454 if (cmode & CEPH_FILE_MODE_WR)
3455 mds->balancer->hit_inode(now, cur, META_POP_IWR);
3456 else
3457 mds->balancer->hit_inode(now, cur, META_POP_IRD,
3458 mdr->client_request->get_source().num());
3459
3460 CDentry *dn = 0;
3461 if (req->get_dentry_wanted()) {
3462 assert(mdr->dn[0].size());
3463 dn = mdr->dn[0].back();
3464 }
3465
3466 mdr->tracei = cur;
3467 mdr->tracedn = dn;
3468 respond_to_request(mdr, 0);
3469 }
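// Hedged illustration (not authoritative): the flag handling above is driven
// by the O_* flags the client passes to open(2).  For example, a request that
// takes the O_TRUNC branch could originate from:
//
//   #include <fcntl.h>
//   int fd = open("/mnt/cephfs/f", O_WRONLY | O_TRUNC);  // cmode = CEPH_FILE_MODE_WR
//
// ceph_flags_to_mode() reduces the O_* flags to a CEPH_FILE_MODE_* value, and
// ceph_caps_for_mode() (used in the immutable-snapshot branch) expands a mode
// back into a cap set; the exact caps issued depend on lock state and other
// clients, so the dout lines above are the ground truth for a given request.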
3470
3471 class C_MDS_openc_finish : public ServerLogContext {
3472 CDentry *dn;
3473 CInode *newi;
3474 snapid_t follows;
3475 public:
3476 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni, snapid_t f) :
3477 ServerLogContext(s, r), dn(d), newi(ni), follows(f) {}
3478 void finish(int r) override {
3479 assert(r == 0);
3480
3481 dn->pop_projected_linkage();
3482
3483 // dirty inode, dn, dir
3484 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3485 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3486 newi->mark_dirty_parent(mdr->ls, true);
3487
3488 mdr->apply();
3489
3490 get_mds()->locker->share_inode_max_size(newi);
3491
3492 MDRequestRef null_ref;
3493 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3494
3495 utime_t now = ceph_clock_now();
3496 get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
3497
3498 server->respond_to_request(mdr, 0);
3499
3500 assert(g_conf->mds_kill_openc_at != 1);
3501 }
3502 };
3503
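// Hedged client-side example, for orientation only: the O_EXCL path below is
// what turns an existing dentry into -EEXIST for the caller:
//
//   int fd = open("/mnt/cephfs/newfile", O_CREAT | O_EXCL | O_WRONLY, 0644);
//   if (fd < 0 && errno == EEXIST) { /* dentry was already linked on the MDS */ }
//
// Without O_EXCL, an existing target simply falls through to handle_client_open().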
3504 /* This function takes responsibility for the passed mdr*/
3505 void Server::handle_client_openc(MDRequestRef& mdr)
3506 {
3507 MClientRequest *req = mdr->client_request;
3508 client_t client = mdr->get_client();
3509
3510 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
3511
3512 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
3513 if (cmode < 0) {
3514 respond_to_request(mdr, -EINVAL);
3515 return;
3516 }
3517
3518 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
3519
3520 if (!excl) {
3521 int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
3522 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
3523 if (r > 0) return;
3524 if (r == 0) {
3525 // it existed.
3526 handle_client_open(mdr);
3527 return;
3528 }
3529 if (r < 0 && r != -ENOENT) {
3530 if (r == -ESTALE) {
3531 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3532 MDSInternalContextBase *c = new C_MDS_TryFindInode(this, mdr);
3533 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
3534 } else {
3535 dout(10) << "FAIL on error " << r << dendl;
3536 respond_to_request(mdr, r);
3537 }
3538 return;
3539 }
3540 }
3541
3542 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3543 file_layout_t *dir_layout = NULL;
3544 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
3545 !excl, false, false, &dir_layout);
3546 if (!dn) return;
3547 if (mdr->snapid != CEPH_NOSNAP) {
3548 respond_to_request(mdr, -EROFS);
3549 return;
3550 }
3551 // set layout
3552 file_layout_t layout;
3553 if (dir_layout)
3554 layout = *dir_layout;
3555 else
3556 layout = mdcache->default_file_layout;
3557
3558 // What kind of client caps are required to complete this operation
3559 uint64_t access = MAY_WRITE;
3560
3561 const auto default_layout = layout;
3562
3563 // fill in any special params from client
3564 if (req->head.args.open.stripe_unit)
3565 layout.stripe_unit = req->head.args.open.stripe_unit;
3566 if (req->head.args.open.stripe_count)
3567 layout.stripe_count = req->head.args.open.stripe_count;
3568 if (req->head.args.open.object_size)
3569 layout.object_size = req->head.args.open.object_size;
3570 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
3571 (__s32)req->head.args.open.pool >= 0) {
3572 layout.pool_id = req->head.args.open.pool;
3573
3574 // make sure we have as new a map as the client
3575 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
3576 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
3577 return;
3578 }
3579 }
3580
3581 // If the client lacks the capability to modify layout pools, only
3582 // permit this request if the requested pool matches what the file
3583 // would have inherited from its parent anyway.
3584 if (default_layout != layout) {
3585 access |= MAY_SET_VXATTR;
3586 }
3587
3588 if (!layout.is_valid()) {
3589 dout(10) << " invalid initial file layout" << dendl;
3590 respond_to_request(mdr, -EINVAL);
3591 return;
3592 }
3593 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
3594 dout(10) << " invalid data pool " << layout.pool_id << dendl;
3595 respond_to_request(mdr, -EINVAL);
3596 return;
3597 }
3598
3599 // created null dn.
3600 CDir *dir = dn->get_dir();
3601 CInode *diri = dir->get_inode();
3602 rdlocks.insert(&diri->authlock);
3603 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3604 return;
3605
3606 if (!check_access(mdr, diri, access))
3607 return;
3608
3609 if (!check_fragment_space(mdr, dir))
3610 return;
3611
3612 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3613
3614 if (!dnl->is_null()) {
3615 // it existed.
3616 assert(req->head.args.open.flags & CEPH_O_EXCL);
3617 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
3618 mdr->tracei = dnl->get_inode();
3619 mdr->tracedn = dn;
3620 respond_to_request(mdr, -EEXIST);
3621 return;
3622 }
3623
3624 // create inode.
3625 SnapRealm *realm = diri->find_snaprealm(); // use directory's realm; inode isn't attached yet.
3626 snapid_t follows = realm->get_newest_seq();
3627
3628 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
3629 req->head.args.open.mode | S_IFREG, &layout);
3630 assert(in);
3631
3632 // it's a file.
3633 dn->push_projected_linkage(in);
3634
3635 in->inode.version = dn->pre_dirty();
3636 if (layout.pool_id != mdcache->default_file_layout.pool_id)
3637 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
3638 in->inode.update_backtrace();
3639 if (cmode & CEPH_FILE_MODE_WR) {
3640 in->inode.client_ranges[client].range.first = 0;
3641 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
3642 in->inode.client_ranges[client].follows = follows;
3643 }
3644 in->inode.rstat.rfiles = 1;
3645
3646 assert(dn->first == follows+1);
3647 in->first = dn->first;
3648
3649 // prepare finisher
3650 mdr->ls = mdlog->get_current_segment();
3651 EUpdate *le = new EUpdate(mdlog, "openc");
3652 mdlog->start_entry(le);
3653 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
3654 journal_allocated_inos(mdr, &le->metablob);
3655 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
3656 le->metablob.add_primary_dentry(dn, in, true, true, true);
3657
3658 // do the open
3659 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
3660 in->authlock.set_state(LOCK_EXCL);
3661 in->xattrlock.set_state(LOCK_EXCL);
3662
3663 // make sure this inode gets into the journal
3664 le->metablob.add_opened_ino(in->ino());
3665 LogSegment *ls = mds->mdlog->get_current_segment();
3666 ls->open_files.push_back(&in->item_open_file);
3667
3668 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
3669
3670 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
3671 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
3672 // add the file created flag onto the reply if create_flags features is supported
3673 ::encode(in->inode.ino, mdr->reply_extra_bl);
3674 }
3675
3676 journal_and_reply(mdr, in, dn, le, fin);
3677
3678 // We hit_dir (via hit_inode) in our finish callback, but by then we might
3679 // have overshot the split size (multiple opencs in flight), so here is
3680 // an early chance to split the dir if this openc makes it oversized.
3681 mds->balancer->maybe_fragment(dir, false);
3682 }
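// Note (hedged): the extra reply payload encoded above is a bare inodeno_t.
// A client advertising CEPH_FEATURE_REPLY_CREATE_INODE can decode it to learn
// that the open actually created the inode; a decode-side sketch, with a
// hypothetical accessor name:
//
//   inodeno_t created;
//   bufferlist::iterator p = reply->get_extra_bl().begin();
//   ::decode(created, p);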
3683
3684
3685
3686 void Server::handle_client_readdir(MDRequestRef& mdr)
3687 {
3688 MClientRequest *req = mdr->client_request;
3689 client_t client = req->get_source().num();
3690 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3691 CInode *diri = rdlock_path_pin_ref(mdr, 0, rdlocks, false, true);
3692 if (!diri) return;
3693
3694 // it's a directory, right?
3695 if (!diri->is_dir()) {
3696 // not a dir
3697 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
3698 respond_to_request(mdr, -ENOTDIR);
3699 return;
3700 }
3701
3702 rdlocks.insert(&diri->filelock);
3703 rdlocks.insert(&diri->dirfragtreelock);
3704
3705 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
3706 return;
3707
3708 if (!check_access(mdr, diri, MAY_READ))
3709 return;
3710
3711 // which frag?
3712 frag_t fg = (__u32)req->head.args.readdir.frag;
3713 unsigned req_flags = (__u32)req->head.args.readdir.flags;
3714 string offset_str = req->get_path2();
3715
3716 __u32 offset_hash = 0;
3717 if (!offset_str.empty())
3718 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
3719 else
3720 offset_hash = (__u32)req->head.args.readdir.offset_hash;
3721
3722 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
3723 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
3724
3725 // does the frag exist?
3726 if (diri->dirfragtree[fg.value()] != fg) {
3727 frag_t newfg;
3728 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3729 if (fg.contains((unsigned)offset_hash)) {
3730 newfg = diri->dirfragtree[offset_hash];
3731 } else {
3732 // client actually wants next frag
3733 newfg = diri->dirfragtree[fg.value()];
3734 }
3735 } else {
3736 offset_str.clear();
3737 newfg = diri->dirfragtree[fg.value()];
3738 }
3739 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
3740 fg = newfg;
3741 }
3742
3743 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
3744 if (!dir) return;
3745
3746 // ok!
3747 dout(10) << "handle_client_readdir on " << *dir << dendl;
3748 assert(dir->is_auth());
3749
3750 if (!dir->is_complete()) {
3751 if (dir->is_frozen()) {
3752 dout(7) << "dir is frozen " << *dir << dendl;
3753 mds->locker->drop_locks(mdr.get());
3754 mdr->drop_local_auth_pins();
3755 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3756 return;
3757 }
3758 // fetch
3759 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
3760 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3761 return;
3762 }
3763
3764 #ifdef MDS_VERIFY_FRAGSTAT
3765 dir->verify_fragstat();
3766 #endif
3767
3768 utime_t now = ceph_clock_now();
3769 mdr->set_mds_stamp(now);
3770
3771 snapid_t snapid = mdr->snapid;
3772 dout(10) << "snapid " << snapid << dendl;
3773
3774 SnapRealm *realm = diri->find_snaprealm();
3775
3776 unsigned max = req->head.args.readdir.max_entries;
3777 if (!max)
3778 max = dir->get_num_any(); // whatever, something big.
3779 unsigned max_bytes = req->head.args.readdir.max_bytes;
3780 if (!max_bytes)
3781 // make sure at least one item can be encoded
3782 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
3783
3784 // start final blob
3785 bufferlist dirbl;
3786 dir->encode_dirstat(dirbl, mds->get_nodeid());
3787
3788 // count bytes available.
3789 // this isn't perfect, but we should capture the main variable/unbounded size items!
3790 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
3791 int bytes_left = max_bytes - front_bytes;
3792 bytes_left -= realm->get_snap_trace().length();
3793
3794 // build dir contents
3795 bufferlist dnbl;
3796 __u32 numfiles = 0;
3797 bool start = !offset_hash && offset_str.empty();
3798 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
3799 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
3800 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
3801 bool end = (it == dir->end());
3802 for (; !end && numfiles < max; end = (it == dir->end())) {
3803 CDentry *dn = it->second;
3804 ++it;
3805
3806 if (dn->state_test(CDentry::STATE_PURGING))
3807 continue;
3808
3809 bool dnp = dn->use_projected(client, mdr);
3810 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
3811
3812 if (dnl->is_null())
3813 continue;
3814
3815 if (dn->last < snapid || dn->first > snapid) {
3816 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
3817 continue;
3818 }
3819
3820 if (!start) {
3821 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
3822 if (!(offset_key < dn->key()))
3823 continue;
3824 }
3825
3826 CInode *in = dnl->get_inode();
3827
3828 if (in && in->ino() == CEPH_INO_CEPH)
3829 continue;
3830
3831 // remote link?
3832 // better for the MDS to do the work, if we think the client will stat any of these files.
3833 if (dnl->is_remote() && !in) {
3834 in = mdcache->get_inode(dnl->get_remote_ino());
3835 if (in) {
3836 dn->link_remote(dnl, in);
3837 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
3838 dout(10) << "skipping bad remote ino on " << *dn << dendl;
3839 continue;
3840 } else {
3841 // touch everything i _do_ have
3842 for (auto &p : *dir) {
3843 if (!p.second->get_linkage()->is_null())
3844 mdcache->lru.lru_touch(p.second);
3845 }
3846
3847 // already issued caps and leases, reply immediately.
3848 if (dnbl.length() > 0) {
3849 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
3850 dout(10) << " open remote dentry after caps were issued, stopping at "
3851 << dnbl.length() << " < " << bytes_left << dendl;
3852 break;
3853 }
3854
3855 mds->locker->drop_locks(mdr.get());
3856 mdr->drop_local_auth_pins();
3857 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
3858 return;
3859 }
3860 }
3861 assert(in);
3862
3863 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
3864 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
3865 break;
3866 }
3867
3868 unsigned start_len = dnbl.length();
3869
3870 // dentry
3871 dout(12) << "including dn " << *dn << dendl;
3872 ::encode(dn->get_name(), dnbl);
3873 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
3874
3875 // inode
3876 dout(12) << "including inode " << *in << dendl;
3877 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
3878 if (r < 0) {
3879 // chop off dn->name, lease
3880 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
3881 bufferlist keep;
3882 keep.substr_of(dnbl, 0, start_len);
3883 dnbl.swap(keep);
3884 break;
3885 }
3886 assert(r >= 0);
3887 numfiles++;
3888
3889 // touch dn
3890 mdcache->lru.lru_touch(dn);
3891 }
3892
3893 __u16 flags = 0;
3894 if (end) {
3895 flags = CEPH_READDIR_FRAG_END;
3896 if (start)
3897 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
3898 }
3899 // clients without CEPH_READDIR_REPLY_BITFLAGS presumably only understand the END and COMPLETE flags
3900 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
3901 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
3902 }
3903
3904 // finish final blob
3905 ::encode(numfiles, dirbl);
3906 ::encode(flags, dirbl);
3907 dirbl.claim_append(dnbl);
3908
3909 // yay, reply
3910 dout(10) << "reply to " << *req << " readdir num=" << numfiles
3911 << " bytes=" << dirbl.length()
3912 << " start=" << (int)start
3913 << " end=" << (int)end
3914 << dendl;
3915 mdr->reply_extra_bl = dirbl;
3916
3917 // bump popularity. NOTE: this doesn't quite capture it.
3918 mds->balancer->hit_dir(now, dir, META_POP_IRD, -1, numfiles);
3919
3920 // reply
3921 mdr->tracei = diri;
3922 respond_to_request(mdr, 0);
3923 }
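// For reference, the reply blob assembled above is laid out as (a hedged
// reading of the encode calls, not a wire-format spec):
//
//   dirstat | __u32 numfiles | __u16 flags | numfiles x { name, LeaseStat, InodeStat }
//
// which is why front_bytes reserves dirbl.length() + sizeof(__u32) + 2 bytes
// before any dentries are packed into dnbl.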
3924
3925
3926
3927 // ===============================================================================
3928 // INODE UPDATES
3929
3930
3931 /*
3932 * finisher for basic inode updates
3933 */
3934 class C_MDS_inode_update_finish : public ServerLogContext {
3935 CInode *in;
3936 bool truncating_smaller, changed_ranges;
3937 public:
3938 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
3939 bool sm=false, bool cr=false) :
3940 ServerLogContext(s, r), in(i), truncating_smaller(sm), changed_ranges(cr) { }
3941 void finish(int r) override {
3942 assert(r == 0);
3943
3944 // apply
3945 in->pop_and_dirty_projected_inode(mdr->ls);
3946 mdr->apply();
3947
3948 // notify any clients
3949 if (truncating_smaller && in->inode.is_truncating()) {
3950 get_mds()->locker->issue_truncate(in);
3951 get_mds()->mdcache->truncate_inode(in, mdr->ls);
3952 }
3953
3954 utime_t now = ceph_clock_now();
3955 get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
3956
3957 server->respond_to_request(mdr, 0);
3958
3959 if (changed_ranges)
3960 get_mds()->locker->share_inode_max_size(in);
3961 }
3962 };
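// Hedged overview of the pattern shared by the update handlers below:
//
//   auto &pi = cur->project_inode();        // project a new inode version
//   pi.inode.version = cur->pre_dirty();    // reserve the next dirty version
//   ... fill in the EUpdate metablob ...
//   journal_and_reply(mdr, cur, dn, le, new C_MDS_inode_update_finish(...));
//
// The finisher above then pops/dirties the projected inode and applies the
// request only once the journal entry is safe.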
3963
3964 void Server::handle_client_file_setlock(MDRequestRef& mdr)
3965 {
3966 MClientRequest *req = mdr->client_request;
3967 set<SimpleLock*> rdlocks, wrlocks, xlocks;
3968
3969 // get the inode to operate on, and set up any locks needed for that
3970 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
3971 if (!cur)
3972 return;
3973
3974 xlocks.insert(&cur->flocklock);
3975 /* acquire_locks will return true if it gets the locks. If it fails,
3976 it will redeliver this request at a later date, so drop the request.
3977 */
3978 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
3979 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
3980 return;
3981 }
3982
3983 // copy the lock change into a ceph_filelock so we can store/apply it
3984 ceph_filelock set_lock;
3985 set_lock.start = req->head.args.filelock_change.start;
3986 set_lock.length = req->head.args.filelock_change.length;
3987 set_lock.client = req->get_orig_source().num();
3988 set_lock.owner = req->head.args.filelock_change.owner;
3989 set_lock.pid = req->head.args.filelock_change.pid;
3990 set_lock.type = req->head.args.filelock_change.type;
3991 bool will_wait = req->head.args.filelock_change.wait;
3992
3993 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
3994
3995 ceph_lock_state_t *lock_state = NULL;
3996 bool interrupt = false;
3997
3998 // get the appropriate lock state
3999 switch (req->head.args.filelock_change.rule) {
4000 case CEPH_LOCK_FLOCK_INTR:
4001 interrupt = true;
4002 // fall-thru
4003 case CEPH_LOCK_FLOCK:
4004 lock_state = cur->get_flock_lock_state();
4005 break;
4006
4007 case CEPH_LOCK_FCNTL_INTR:
4008 interrupt = true;
4009 // fall-thru
4010 case CEPH_LOCK_FCNTL:
4011 lock_state = cur->get_fcntl_lock_state();
4012 break;
4013
4014 default:
4015 dout(10) << "got unknown lock rule " << (int)req->head.args.filelock_change.rule
4016 << ", dropping request!" << dendl;
4017 respond_to_request(mdr, -EOPNOTSUPP);
4018 return;
4019 }
4020
4021 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4022 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4023 list<ceph_filelock> activated_locks;
4024 list<MDSInternalContextBase*> waiters;
4025 if (lock_state->is_waiting(set_lock)) {
4026 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4027 lock_state->remove_waiting(set_lock);
4028 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4029 } else if (!interrupt) {
4030 dout(10) << " unlock attempt on " << set_lock << dendl;
4031 lock_state->remove_lock(set_lock, activated_locks);
4032 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4033 }
4034 mds->queue_waiters(waiters);
4035
4036 respond_to_request(mdr, 0);
4037 } else {
4038 dout(10) << " lock attempt on " << set_lock << dendl;
4039 bool deadlock = false;
4040 if (mdr->more()->flock_was_waiting &&
4041 !lock_state->is_waiting(set_lock)) {
4042 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4043 respond_to_request(mdr, -EINTR);
4044 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4045 dout(10) << " it failed on this attempt" << dendl;
4046 // couldn't set lock right now
4047 if (deadlock) {
4048 respond_to_request(mdr, -EDEADLK);
4049 } else if (!will_wait) {
4050 respond_to_request(mdr, -EWOULDBLOCK);
4051 } else {
4052 dout(10) << " added to waiting list" << dendl;
4053 assert(lock_state->is_waiting(set_lock));
4054 mdr->more()->flock_was_waiting = true;
4055 mds->locker->drop_locks(mdr.get());
4056 mdr->drop_local_auth_pins();
4057 mdr->mark_event("failed to add lock, waiting");
4058 mdr->mark_nowarn();
4059 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4060 }
4061 } else
4062 respond_to_request(mdr, 0);
4063 }
4064 dout(10) << " state after lock change: " << *lock_state << dendl;
4065 }
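// Hedged client-side illustration: the FLOCK vs. FCNTL rules above correspond
// to the two POSIX advisory locking APIs.  A blocking fcntl write lock, which
// arrives here with rule CEPH_LOCK_FCNTL and wait=1:
//
//   struct flock fl = {};
//   fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET;  // start/len 0 = whole file
//   fcntl(fd, F_SETLKW, &fl);  // F_SETLK instead maps to wait=0 / -EWOULDBLOCK
//
// whereas flock(fd, LOCK_EX) arrives with rule CEPH_LOCK_FLOCK.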
4066
4067 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4068 {
4069 MClientRequest *req = mdr->client_request;
4070 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4071
4072 // get the inode to operate on, and set up any locks needed for that
4073 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4074 if (!cur)
4075 return;
4076
4077 /* acquire_locks will return true if it gets the locks. If it fails,
4078 it will redeliver this request at a later date, so drop the request.
4079 */
4080 rdlocks.insert(&cur->flocklock);
4081 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) {
4082 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4083 return;
4084 }
4085
4086 // copy the lock change into a ceph_filelock so we can store/apply it
4087 ceph_filelock checking_lock;
4088 checking_lock.start = req->head.args.filelock_change.start;
4089 checking_lock.length = req->head.args.filelock_change.length;
4090 checking_lock.client = req->get_orig_source().num();
4091 checking_lock.owner = req->head.args.filelock_change.owner;
4092 checking_lock.pid = req->head.args.filelock_change.pid;
4093 checking_lock.type = req->head.args.filelock_change.type;
4094
4095 // get the appropriate lock state
4096 ceph_lock_state_t *lock_state = NULL;
4097 switch (req->head.args.filelock_change.rule) {
4098 case CEPH_LOCK_FLOCK:
4099 lock_state = cur->get_flock_lock_state();
4100 break;
4101
4102 case CEPH_LOCK_FCNTL:
4103 lock_state = cur->get_fcntl_lock_state();
4104 break;
4105
4106 default:
4107 dout(10) << "got unknown lock rule " << (int)req->head.args.filelock_change.rule << dendl;
4108 respond_to_request(mdr, -EINVAL);
4109 return;
4110 }
4111 lock_state->look_for_lock(checking_lock);
4112
4113 bufferlist lock_bl;
4114 ::encode(checking_lock, lock_bl);
4115
4116 mdr->reply_extra_bl = lock_bl;
4117 respond_to_request(mdr, 0);
4118 }
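// Hedged note: this is the server side of an F_GETLK-style query; on the
// client, fcntl(fd, F_GETLK, &fl) ends up here, and look_for_lock() rewrites
// checking_lock in place to describe a conflicting lock (or marks it unlocked
// if nothing conflicts), which is what gets encoded back to the caller.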
4119
4120 void Server::handle_client_setattr(MDRequestRef& mdr)
4121 {
4122 MClientRequest *req = mdr->client_request;
4123 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4124 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4125 if (!cur) return;
4126
4127 if (mdr->snapid != CEPH_NOSNAP) {
4128 respond_to_request(mdr, -EROFS);
4129 return;
4130 }
4131 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4132 respond_to_request(mdr, -EPERM);
4133 return;
4134 }
4135
4136 __u32 mask = req->head.args.setattr.mask;
4137 __u32 access_mask = MAY_WRITE;
4138
4139 // xlock inode
4140 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4141 xlocks.insert(&cur->authlock);
4142 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4143 xlocks.insert(&cur->filelock);
4144 if (mask & CEPH_SETATTR_CTIME)
4145 wrlocks.insert(&cur->versionlock);
4146
4147 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4148 return;
4149
4150 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4151 access_mask |= MAY_CHOWN;
4152
4153 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4154 access_mask |= MAY_CHGRP;
4155
4156 if (!check_access(mdr, cur, access_mask))
4157 return;
4158
4159 // trunc from bigger -> smaller?
4160 auto pip = cur->get_projected_inode();
4161
4162 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
4163
4164 // ENOSPC on growing file while full, but allow shrinks
4165 if (is_full && req->head.args.setattr.size > old_size) {
4166 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4167 respond_to_request(mdr, -ENOSPC);
4168 return;
4169 }
4170
4171 bool truncating_smaller = false;
4172 if (mask & CEPH_SETATTR_SIZE) {
4173 truncating_smaller = req->head.args.setattr.size < old_size;
4174 if (truncating_smaller && pip->is_truncating()) {
4175 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4176 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4177 mds->locker->drop_locks(mdr.get());
4178 mdr->drop_local_auth_pins();
4179 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4180 return;
4181 }
4182 }
4183
4184 bool changed_ranges = false;
4185
4186 // project update
4187 mdr->ls = mdlog->get_current_segment();
4188 EUpdate *le = new EUpdate(mdlog, "setattr");
4189 mdlog->start_entry(le);
4190
4191 auto &pi = cur->project_inode();
4192
4193 if (mask & CEPH_SETATTR_UID)
4194 pi.inode.uid = req->head.args.setattr.uid;
4195 if (mask & CEPH_SETATTR_GID)
4196 pi.inode.gid = req->head.args.setattr.gid;
4197
4198 if (mask & CEPH_SETATTR_MODE)
4199 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4200 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4201 S_ISREG(pi.inode.mode) &&
4202 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4203 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4204 }
4205
4206 if (mask & CEPH_SETATTR_MTIME)
4207 pi.inode.mtime = req->head.args.setattr.mtime;
4208 if (mask & CEPH_SETATTR_ATIME)
4209 pi.inode.atime = req->head.args.setattr.atime;
4210 if (mask & CEPH_SETATTR_BTIME)
4211 pi.inode.btime = req->head.args.setattr.btime;
4212 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4213 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4214 if (mask & CEPH_SETATTR_SIZE) {
4215 if (truncating_smaller) {
4216 pi.inode.truncate(old_size, req->head.args.setattr.size);
4217 le->metablob.add_truncate_start(cur->ino());
4218 } else {
4219 pi.inode.size = req->head.args.setattr.size;
4220 pi.inode.rstat.rbytes = pi.inode.size;
4221 }
4222 pi.inode.mtime = mdr->get_op_stamp();
4223
4224 // adjust client's max_size?
4225 CInode::mempool_inode::client_range_map new_ranges;
4226 bool max_increased = false;
4227 mds->locker->calc_new_client_ranges(cur, pi.inode.size, &new_ranges, &max_increased);
4228 if (pi.inode.client_ranges != new_ranges) {
4229 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
4230 pi.inode.client_ranges = new_ranges;
4231 changed_ranges = true;
4232 }
4233 }
4234
4235 pi.inode.version = cur->pre_dirty();
4236 pi.inode.ctime = mdr->get_op_stamp();
4237 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4238 pi.inode.rstat.rctime = mdr->get_op_stamp();
4239 pi.inode.change_attr++;
4240
4241 // log + wait
4242 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4243 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4244 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4245
4246 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4247 truncating_smaller, changed_ranges));
4248
4249 // flush immediately if there are readers/writers waiting
4250 if (xlocks.count(&cur->filelock) &&
4251 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4252 mds->mdlog->flush();
4253 }
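// Rough mapping (hedged) from common syscalls to the setattr masks handled
// above:
//
//   truncate/ftruncate  -> CEPH_SETATTR_SIZE   (xlocks filelock)
//   chmod               -> CEPH_SETATTR_MODE   (xlocks authlock)
//   chown               -> CEPH_SETATTR_UID|CEPH_SETATTR_GID (MAY_CHOWN/MAY_CHGRP)
//   utimes/utimensat    -> CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME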
4254
4255 /* Takes responsibility for mdr */
4256 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4257 {
4258 CInode *in = mdr->in[0];
4259 client_t client = mdr->get_client();
4260 assert(in);
4261
4262 dout(10) << "do_open_truncate " << *in << dendl;
4263
4264 SnapRealm *realm = in->find_snaprealm();
4265 mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4266
4267 mdr->ls = mdlog->get_current_segment();
4268 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4269 mdlog->start_entry(le);
4270
4271 // prepare
4272 auto &pi = in->project_inode();
4273 pi.inode.version = in->pre_dirty();
4274 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
4275 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4276 pi.inode.rstat.rctime = mdr->get_op_stamp();
4277 pi.inode.change_attr++;
4278
4279 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
4280 if (old_size > 0) {
4281 pi.inode.truncate(old_size, 0);
4282 le->metablob.add_truncate_start(in->ino());
4283 }
4284
4285 bool changed_ranges = false;
4286 if (cmode & CEPH_FILE_MODE_WR) {
4287 pi.inode.client_ranges[client].range.first = 0;
4288 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
4289 pi.inode.client_ranges[client].follows = in->find_snaprealm()->get_newest_seq();
4290 changed_ranges = true;
4291 }
4292
4293 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4294
4295 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4296 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4297
4298 // make sure ino gets into the journal
4299 le->metablob.add_opened_ino(in->ino());
4300 LogSegment *ls = mds->mdlog->get_current_segment();
4301 ls->open_files.push_back(&in->item_open_file);
4302
4303 mdr->o_trunc = true;
4304
4305 CDentry *dn = 0;
4306 if (mdr->client_request->get_dentry_wanted()) {
4307 assert(mdr->dn[0].size());
4308 dn = mdr->dn[0].back();
4309 }
4310
4311 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4312 changed_ranges));
4313 // Although the `open` part can give an early reply, the truncation won't
4314 // happen until our EUpdate is persistent; to give the client a prompt
4315 // response we must also flush that event.
4316 mdlog->flush();
4317 }
4318
4319
4320 /* This function cleans up the passed mdr */
4321 void Server::handle_client_setlayout(MDRequestRef& mdr)
4322 {
4323 MClientRequest *req = mdr->client_request;
4324 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4325 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4326 if (!cur) return;
4327
4328 if (mdr->snapid != CEPH_NOSNAP) {
4329 respond_to_request(mdr, -EROFS);
4330 return;
4331 }
4332 if (!cur->is_file()) {
4333 respond_to_request(mdr, -EINVAL);
4334 return;
4335 }
4336 if (cur->get_projected_inode()->size ||
4337 cur->get_projected_inode()->truncate_seq > 1) {
4338 respond_to_request(mdr, -ENOTEMPTY);
4339 return;
4340 }
4341
4342 // validate layout
4343 file_layout_t layout = cur->get_projected_inode()->layout;
4344 // save existing layout for later
4345 const auto old_layout = layout;
4346
4347 int access = MAY_WRITE;
4348
4349 if (req->head.args.setlayout.layout.fl_object_size > 0)
4350 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4351 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4352 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4353 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4354 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4355 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4356 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4357
4358 // make sure we have as new a map as the client
4359 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4360 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4361 return;
4362 }
4363 }
4364
4365 // Don't permit layout modifications without 'p' caps
4366 if (layout != old_layout) {
4367 access |= MAY_SET_VXATTR;
4368 }
4369
4370 if (!layout.is_valid()) {
4371 dout(10) << "bad layout" << dendl;
4372 respond_to_request(mdr, -EINVAL);
4373 return;
4374 }
4375 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4376 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4377 respond_to_request(mdr, -EINVAL);
4378 return;
4379 }
4380
4381 xlocks.insert(&cur->filelock);
4382 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4383 return;
4384
4385 if (!check_access(mdr, cur, access))
4386 return;
4387
4388 // project update
4389 auto &pi = cur->project_inode();
4390 pi.inode.layout = layout;
4391 // add the old pool to the inode
4392 pi.inode.add_old_pool(old_layout.pool_id);
4393 pi.inode.version = cur->pre_dirty();
4394 pi.inode.ctime = mdr->get_op_stamp();
4395 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4396 pi.inode.rstat.rctime = mdr->get_op_stamp();
4397 pi.inode.change_attr++;
4398
4399 // log + wait
4400 mdr->ls = mdlog->get_current_segment();
4401 EUpdate *le = new EUpdate(mdlog, "setlayout");
4402 mdlog->start_entry(le);
4403 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4404 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4405 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4406
4407 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4408 }
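// Hedged note: this op is typically driven by the legacy file-layout ioctl
// (CEPH_IOC_SET_LAYOUT in the kernel client); newer clients express the same
// intent through the ceph.file.layout* vxattrs handled by handle_set_vxattr()
// further below.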
4409
4410 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4411 {
4412 MClientRequest *req = mdr->client_request;
4413 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4414 file_layout_t *dir_layout = NULL;
4415 CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4416 if (!cur) return;
4417
4418 if (mdr->snapid != CEPH_NOSNAP) {
4419 respond_to_request(mdr, -EROFS);
4420 return;
4421 }
4422
4423 if (!cur->is_dir()) {
4424 respond_to_request(mdr, -ENOTDIR);
4425 return;
4426 }
4427
4428 xlocks.insert(&cur->policylock);
4429 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4430 return;
4431
4432 // validate layout
4433 const auto old_pi = cur->get_projected_inode();
4434 file_layout_t layout;
4435 if (old_pi->has_layout())
4436 layout = old_pi->layout;
4437 else if (dir_layout)
4438 layout = *dir_layout;
4439 else
4440 layout = mdcache->default_file_layout;
4441
4442 // Level of access required to complete
4443 int access = MAY_WRITE;
4444
4445 const auto old_layout = layout;
4446
4447 if (req->head.args.setlayout.layout.fl_object_size > 0)
4448 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4449 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4450 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4451 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4452 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4453 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4454 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4455 // make sure we have as new a map as the client
4456 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4457 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4458 return;
4459 }
4460 }
4461
4462 if (layout != old_layout) {
4463 access |= MAY_SET_VXATTR;
4464 }
4465
4466 if (!layout.is_valid()) {
4467 dout(10) << "bad layout" << dendl;
4468 respond_to_request(mdr, -EINVAL);
4469 return;
4470 }
4471 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4472 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4473 respond_to_request(mdr, -EINVAL);
4474 return;
4475 }
4476
4477 if (!check_access(mdr, cur, access))
4478 return;
4479
4480 auto &pi = cur->project_inode();
4481 pi.inode.layout = layout;
4482 pi.inode.version = cur->pre_dirty();
4483
4484 // log + wait
4485 mdr->ls = mdlog->get_current_segment();
4486 EUpdate *le = new EUpdate(mdlog, "setlayout");
4487 mdlog->start_entry(le);
4488 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4489 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4490 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4491
4492 mdr->no_early_reply = true;
4493 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4494 }
4495
4496 // XATTRS
4497
4498 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
4499 file_layout_t *layout, bool validate)
4500 {
4501 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
4502 try {
4503 if (name == "layout") {
4504 string::iterator begin = value.begin();
4505 string::iterator end = value.end();
4506 keys_and_values<string::iterator> p; // create instance of parser
4507 std::map<string, string> m; // map to receive results
4508 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4509 return -EINVAL;
4510 }
4511 string left(begin, end);
4512 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4513 if (begin != end)
4514 return -EINVAL;
4515 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4516 // Skip validation on each attr; we validate once at the end (to avoid
4517 // rejecting intermediate states when the overall result is ok)
4518 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
4519 osdmap, layout, false);
4520 if (r < 0)
4521 return r;
4522 }
4523 } else if (name == "layout.object_size") {
4524 layout->object_size = boost::lexical_cast<unsigned>(value);
4525 } else if (name == "layout.stripe_unit") {
4526 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
4527 } else if (name == "layout.stripe_count") {
4528 layout->stripe_count = boost::lexical_cast<unsigned>(value);
4529 } else if (name == "layout.pool") {
4530 try {
4531 layout->pool_id = boost::lexical_cast<unsigned>(value);
4532 } catch (boost::bad_lexical_cast const&) {
4533 int64_t pool = osdmap.lookup_pg_pool_name(value);
4534 if (pool < 0) {
4535 dout(10) << " unknown pool " << value << dendl;
4536 return -ENOENT;
4537 }
4538 layout->pool_id = pool;
4539 }
4540 } else if (name == "layout.pool_namespace") {
4541 layout->pool_ns = value;
4542 } else {
4543 dout(10) << " unknown layout vxattr " << name << dendl;
4544 return -EINVAL;
4545 }
4546 } catch (boost::bad_lexical_cast const&) {
4547 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4548 return -EINVAL;
4549 }
4550
4551 if (validate && !layout->is_valid()) {
4552 dout(10) << "bad layout" << dendl;
4553 return -EINVAL;
4554 }
4555 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
4556 dout(10) << " invalid data pool " << layout->pool_id << dendl;
4557 return -EINVAL;
4558 }
4559 return 0;
4560 }
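// Hedged examples of values this parser accepts (the combined "layout" form
// is a space-separated key=value list; pool may be a numeric id or a name):
//
//   setfattr -n ceph.dir.layout -v \
//     "stripe_unit=4194304 stripe_count=2 object_size=4194304 pool=cephfs_data" /mnt/dir
//   setfattr -n ceph.file.layout.pool -v cephfs_data /mnt/file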
4561
4562 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
4563 {
4564 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
4565 try {
4566 if (name == "quota") {
4567 string::iterator begin = value.begin();
4568 string::iterator end = value.end();
4569 keys_and_values<string::iterator> p; // create instance of parser
4570 std::map<string, string> m; // map to receive results
4571 if (!qi::parse(begin, end, p, m)) { // returns true if successful
4572 return -EINVAL;
4573 }
4574 string left(begin, end);
4575 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
4576 if (begin != end)
4577 return -EINVAL;
4578 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
4579 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
4580 if (r < 0)
4581 return r;
4582 }
4583 } else if (name == "quota.max_bytes") {
4584 int64_t q = boost::lexical_cast<int64_t>(value);
4585 if (q < 0)
4586 return -EINVAL;
4587 quota->max_bytes = q;
4588 } else if (name == "quota.max_files") {
4589 int64_t q = boost::lexical_cast<int64_t>(value);
4590 if (q < 0)
4591 return -EINVAL;
4592 quota->max_files = q;
4593 } else {
4594 dout(10) << " unknown quota vxattr " << name << dendl;
4595 return -EINVAL;
4596 }
4597 } catch (boost::bad_lexical_cast const&) {
4598 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4599 return -EINVAL;
4600 }
4601
4602 if (!quota->is_valid()) {
4603 dout(10) << "bad quota" << dendl;
4604 return -EINVAL;
4605 }
4606 return 0;
4607 }
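// Hedged usage examples for the quota vxattrs parsed above:
//
//   setfattr -n ceph.quota.max_bytes -v 10737418240 /mnt/dir   # 10 GiB
//   setfattr -n ceph.quota.max_files -v 10000 /mnt/dir
//   setfattr -n ceph.quota.max_bytes -v 0 /mnt/dir             # 0 disables the limit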
4608
4609 /*
4610 * Verify that the file layout attribute carried by client
4611 * is well-formatted.
4612 * Return 0 on success, otherwise this function takes
4613 * responsibility for the passed mdr.
4614 */
4615 int Server::check_layout_vxattr(MDRequestRef& mdr,
4616 string name,
4617 string value,
4618 file_layout_t *layout)
4619 {
4620 MClientRequest *req = mdr->client_request;
4621 epoch_t epoch;
4622 int r;
4623
4624 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4625 r = parse_layout_vxattr(name, value, osdmap, layout);
4626 epoch = osdmap.get_epoch();
4627 });
4628
4629 if (r == -ENOENT) {
4630
4631 // we don't have the specified pool; make sure our map is at least
4632 // as new as the client's.
4633 epoch_t req_epoch = req->get_osdmap_epoch();
4634
4635 if (req_epoch > epoch) {
4636
4637 // well, our map is older; wait until we have a newer osdmap.
4638 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
4639
4640 if (!mds->objecter->wait_for_map(req_epoch, fin))
4641 return r; // wait, fin will retry this request later
4642
4643 delete fin;
4644
4645 // now we have at least as new a map as the client, try again.
4646 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
4647 r = parse_layout_vxattr(name, value, osdmap, layout);
4648 epoch = osdmap.get_epoch();
4649 });
4650
4651 assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
4652
4653 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
4654
4655 // For compatibility with clients running old code, we still need to
4656 // get the latest map. One day, when COMPACT_VERSION of MClientRequest
4657 // is >= 3, we can remove this code.
4658 mdr->waited_for_osdmap = true;
4659 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
4660 mds, new C_MDS_RetryRequest(mdcache, mdr)));
4661 return r;
4662 }
4663 }
4664
4665 if (r < 0) {
4666
4667 if (r == -ENOENT)
4668 r = -EINVAL;
4669
4670 respond_to_request(mdr, r);
4671 return r;
4672 }
4673
4674 // all is well
4675 return 0;
4676 }
4677
4678 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
4679 file_layout_t *dir_layout,
4680 set<SimpleLock*> rdlocks,
4681 set<SimpleLock*> wrlocks,
4682 set<SimpleLock*> xlocks)
4683 {
4684 MClientRequest *req = mdr->client_request;
4685 string name(req->get_path2());
4686 bufferlist bl = req->get_data();
4687 string value (bl.c_str(), bl.length());
4688 dout(10) << "handle_set_vxattr " << name
4689 << " val " << value.length()
4690 << " bytes on " << *cur
4691 << dendl;
4692
4693 CInode::mempool_inode *pip = nullptr;
4694 string rest;
4695
4696 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
4697 return;
4698 }
4699
4700 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
4701 if (!cur->is_dir()) {
4702 respond_to_request(mdr, -EINVAL);
4703 return;
4704 }
4705
4706 file_layout_t layout;
4707 if (cur->get_projected_inode()->has_layout())
4708 layout = cur->get_projected_inode()->layout;
4709 else if (dir_layout)
4710 layout = *dir_layout;
4711 else
4712 layout = mdcache->default_file_layout;
4713
4714 rest = name.substr(name.find("layout"));
4715 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4716 return;
4717
4718 xlocks.insert(&cur->policylock);
4719 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4720 return;
4721
4722 auto &pi = cur->project_inode();
4723 pi.inode.layout = layout;
4724 mdr->no_early_reply = true;
4725 pip = &pi.inode;
4726 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
4727 if (!cur->is_file()) {
4728 respond_to_request(mdr, -EINVAL);
4729 return;
4730 }
4731 if (cur->get_projected_inode()->size ||
4732 cur->get_projected_inode()->truncate_seq > 1) {
4733 respond_to_request(mdr, -ENOTEMPTY);
4734 return;
4735 }
4736 file_layout_t layout = cur->get_projected_inode()->layout;
4737 rest = name.substr(name.find("layout"));
4738 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
4739 return;
4740
4741 xlocks.insert(&cur->filelock);
4742 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4743 return;
4744
4745 auto &pi = cur->project_inode();
4746 int64_t old_pool = pi.inode.layout.pool_id;
4747 pi.inode.add_old_pool(old_pool);
4748 pi.inode.layout = layout;
4749 pip = &pi.inode;
4750 } else if (name.compare(0, 10, "ceph.quota") == 0) {
4751 if (!cur->is_dir() || cur->is_root()) {
4752 respond_to_request(mdr, -EINVAL);
4753 return;
4754 }
4755
4756 quota_info_t quota = cur->get_projected_inode()->quota;
4757
4758 rest = name.substr(name.find("quota"));
4759 int r = parse_quota_vxattr(rest, value, &quota);
4760 if (r < 0) {
4761 respond_to_request(mdr, r);
4762 return;
4763 }
4764
4765 xlocks.insert(&cur->policylock);
4766 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4767 return;
4768
4769 auto &pi = cur->project_inode();
4770 pi.inode.quota = quota;
4771
4772 mdr->no_early_reply = true;
4773 pip = &pi.inode;
4774
4775 client_t exclude_ct = mdr->get_client();
4776 mdcache->broadcast_quota_to_client(cur, exclude_ct);
4777 } else if (name.find("ceph.dir.pin") == 0) {
4778 if (!cur->is_dir() || cur->is_root()) {
4779 respond_to_request(mdr, -EINVAL);
4780 return;
4781 }
4782
4783 mds_rank_t rank;
4784 try {
4785 rank = boost::lexical_cast<mds_rank_t>(value);
4786 if (rank < 0) rank = MDS_RANK_NONE;
4787 } catch (boost::bad_lexical_cast const&) {
4788 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
4789 respond_to_request(mdr, -EINVAL);
4790 return;
4791 }
4792
4793 xlocks.insert(&cur->policylock);
4794 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4795 return;
4796
4797 auto &pi = cur->project_inode();
4798 cur->set_export_pin(rank);
4799 pip = &pi.inode;
4800 } else {
4801 dout(10) << " unknown vxattr " << name << dendl;
4802 respond_to_request(mdr, -EINVAL);
4803 return;
4804 }
4805
4806 pip->change_attr++;
4807 pip->ctime = mdr->get_op_stamp();
4808 if (mdr->get_op_stamp() > pip->rstat.rctime)
4809 pip->rstat.rctime = mdr->get_op_stamp();
4810 pip->version = cur->pre_dirty();
4811 if (cur->is_file())
4812 pip->update_backtrace();
4813
4814 // log + wait
4815 mdr->ls = mdlog->get_current_segment();
4816 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
4817 mdlog->start_entry(le);
4818 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4819 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4820 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4821
4822 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4823 return;
4824 }
4825
4826 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
4827 file_layout_t *dir_layout,
4828 set<SimpleLock*> rdlocks,
4829 set<SimpleLock*> wrlocks,
4830 set<SimpleLock*> xlocks)
4831 {
4832 MClientRequest *req = mdr->client_request;
4833 string name(req->get_path2());
4834
4835 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
4836
4837 if (name == "ceph.dir.layout") {
4838 if (!cur->is_dir()) {
4839 respond_to_request(mdr, -ENODATA);
4840 return;
4841 }
4842 if (cur->is_root()) {
4843 dout(10) << "can't remove layout policy on the root directory" << dendl;
4844 respond_to_request(mdr, -EINVAL);
4845 return;
4846 }
4847
4848 if (!cur->get_projected_inode()->has_layout()) {
4849 respond_to_request(mdr, -ENODATA);
4850 return;
4851 }
4852
4853 xlocks.insert(&cur->policylock);
4854 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4855 return;
4856
4857 auto &pi = cur->project_inode();
4858 pi.inode.clear_layout();
4859 pi.inode.version = cur->pre_dirty();
4860
4861 // log + wait
4862 mdr->ls = mdlog->get_current_segment();
4863 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
4864 mdlog->start_entry(le);
4865 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4866 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4867 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4868
4869 mdr->no_early_reply = true;
4870 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4871 return;
4872 } else if (name == "ceph.dir.layout.pool_namespace"
4873 || name == "ceph.file.layout.pool_namespace") {
4874 // The pool namespace is the only layout field with a meaningful
4875 // null/none value (an empty string means the default namespace).
4876 // Removing it is equivalent to a setxattr with an empty string, so
4877 // pass the empty payload of the rmxattr request through.
4878 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4879 return;
4880 }
4881
4882 respond_to_request(mdr, -ENODATA);
4883 }
4884
4885 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
4886 CInode *in;
4887 public:
4888
4889 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
4890 ServerLogContext(s, r), in(i) { }
4891 void finish(int r) override {
4892 assert(r == 0);
4893
4894 // apply
4895 in->pop_and_dirty_projected_inode(mdr->ls);
4896
4897 mdr->apply();
4898
4899 utime_t now = ceph_clock_now();
4900 get_mds()->balancer->hit_inode(now, in, META_POP_IWR);
4901
4902 server->respond_to_request(mdr, 0);
4903 }
4904 };
4905
4906 void Server::handle_client_setxattr(MDRequestRef& mdr)
4907 {
4908 MClientRequest *req = mdr->client_request;
4909 string name(req->get_path2());
4910 set<SimpleLock*> rdlocks, wrlocks, xlocks;
4911 CInode *cur;
4912
4913 file_layout_t *dir_layout = NULL;
4914 if (name.compare(0, 15, "ceph.dir.layout") == 0)
4915 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
4916 else
4917 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
4918 if (!cur)
4919 return;
4920
4921 if (mdr->snapid != CEPH_NOSNAP) {
4922 respond_to_request(mdr, -EROFS);
4923 return;
4924 }
4925
4926 int flags = req->head.args.setxattr.flags;
4927
4928 // magic ceph.* namespace?
4929 if (name.compare(0, 5, "ceph.") == 0) {
4930 handle_set_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
4931 return;
4932 }
4933
4934 xlocks.insert(&cur->xattrlock);
4935 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
4936 return;
4937
4938 if (!check_access(mdr, cur, MAY_WRITE))
4939 return;
4940
4941 auto pxattrs = cur->get_projected_xattrs();
4942 size_t len = req->get_data().length();
4943 size_t inc = len + name.length();
4944
4945 // check xattrs kv pairs size
4946 size_t cur_xattrs_size = 0;
4947 for (const auto& p : *pxattrs) {
4948 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(std::string(boost::string_view(p.first))) == 0)) {
4949 continue;
4950 }
4951 cur_xattrs_size += p.first.length() + p.second.length();
4952 }
4953
4954 if (((cur_xattrs_size + inc) > g_conf->mds_max_xattr_pairs_size)) {
4955 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
4956 << cur_xattrs_size << ", inc " << inc << dendl;
4957 respond_to_request(mdr, -ENOSPC);
4958 return;
4959 }
4960
4961 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
4962 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
4963 respond_to_request(mdr, -EEXIST);
4964 return;
4965 }
4966 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(boost::string_view(name)))) {
4967 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
4968 respond_to_request(mdr, -ENODATA);
4969 return;
4970 }
4971
4972 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
4973
4974 // project update
4975 auto &pi = cur->project_inode(true);
4976 pi.inode.version = cur->pre_dirty();
4977 pi.inode.ctime = mdr->get_op_stamp();
4978 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4979 pi.inode.rstat.rctime = mdr->get_op_stamp();
4980 pi.inode.change_attr++;
4981 pi.inode.xattr_version++;
4982 auto &px = *pi.xattrs;
4983 if ((flags & CEPH_XATTR_REMOVE)) {
4984 px.erase(mempool::mds_co::string(boost::string_view(name)));
4985 } else {
4986 bufferptr b = buffer::create(len);
4987 if (len)
4988 req->get_data().copy(0, len, b.c_str());
4989 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(boost::string_view(name))), std::forward_as_tuple(b));
4990 if (!em.second)
4991 em.first->second = b;
4992 }
4993
4994 // log + wait
4995 mdr->ls = mdlog->get_current_segment();
4996 EUpdate *le = new EUpdate(mdlog, "setxattr");
4997 mdlog->start_entry(le);
4998 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4999 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5000 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5001
5002 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5003 }
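// Hedged client-side mapping for the CEPH_XATTR_* flags checked above:
//
//   setxattr(path, "user.k", val, len, XATTR_CREATE);   // -EEXIST if key exists
//   setxattr(path, "user.k", val, len, XATTR_REPLACE);  // -ENODATA if key absent
//
// (XATTR_CREATE/XATTR_REPLACE from <sys/xattr.h> map to CEPH_XATTR_CREATE and
// CEPH_XATTR_REPLACE respectively.)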
5004
5005 void Server::handle_client_removexattr(MDRequestRef& mdr)
5006 {
5007 MClientRequest *req = mdr->client_request;
5008 std::string name(req->get_path2());
5009 std::set<SimpleLock*> rdlocks, wrlocks, xlocks;
5010 file_layout_t *dir_layout = NULL;
5011 CInode *cur;
5012 if (name == "ceph.dir.layout")
5013 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true, false, &dir_layout);
5014 else
5015 cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
5016 if (!cur)
5017 return;
5018
5019 if (mdr->snapid != CEPH_NOSNAP) {
5020 respond_to_request(mdr, -EROFS);
5021 return;
5022 }
5023
5024 if (name.compare(0, 5, "ceph.") == 0) {
5025 handle_remove_vxattr(mdr, cur, dir_layout, rdlocks, wrlocks, xlocks);
5026 return;
5027 }
5028
5029 xlocks.insert(&cur->xattrlock);
5030 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5031 return;
5032
5033 auto pxattrs = cur->get_projected_xattrs();
5034 if (pxattrs->count(mempool::mds_co::string(boost::string_view(name))) == 0) {
5035 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5036 respond_to_request(mdr, -ENODATA);
5037 return;
5038 }
5039
5040 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5041
5042 // project update
5043 auto &pi = cur->project_inode(true);
5044 auto &px = *pi.xattrs;
5045 pi.inode.version = cur->pre_dirty();
5046 pi.inode.ctime = mdr->get_op_stamp();
5047 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5048 pi.inode.rstat.rctime = mdr->get_op_stamp();
5049 pi.inode.change_attr++;
5050 pi.inode.xattr_version++;
5051 px.erase(mempool::mds_co::string(boost::string_view(name)));
5052
5053 // log + wait
5054 mdr->ls = mdlog->get_current_segment();
5055 EUpdate *le = new EUpdate(mdlog, "removexattr");
5056 mdlog->start_entry(le);
5057 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5058 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5059 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5060
5061 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5062 }
5063
5064
5065 // =================================================================
5066 // DIRECTORY and NAMESPACE OPS
5067
5068
5069 // ------------------------------------------------
5070
5071 // MKNOD
5072
5073 class C_MDS_mknod_finish : public ServerLogContext {
5074 CDentry *dn;
5075 CInode *newi;
5076 public:
5077 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5078 ServerLogContext(s, r), dn(d), newi(ni) {}
5079 void finish(int r) override {
5080 assert(r == 0);
5081
5082 // link the inode
5083 dn->pop_projected_linkage();
5084
5085 // be a bit hacky with the inode version, here.. we decrement it
5086 // just to keep mark_dirty() happy. (we didn't bother projecting
5087 // a new version of the inode since it's just been created)
5088 newi->inode.version--;
5089 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5090 newi->mark_dirty_parent(mdr->ls, true);
5091
5092 // mkdir?
5093 if (newi->inode.is_dir()) {
5094 CDir *dir = newi->get_dirfrag(frag_t());
5095 assert(dir);
5096 dir->fnode.version--;
5097 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5098 dir->mark_new(mdr->ls);
5099 }
5100
5101 mdr->apply();
5102
5103 MDRequestRef null_ref;
5104 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5105
5106 if (newi->inode.is_file())
5107 get_mds()->locker->share_inode_max_size(newi);
5108
5109 // hit pop
5110 utime_t now = ceph_clock_now();
5111 get_mds()->balancer->hit_inode(now, newi, META_POP_IWR);
5112
5113 // reply
5114 server->respond_to_request(mdr, 0);
5115 }
5116 };
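// This finisher is shared by mknod, mkdir and symlink (all three hand it to
// journal_and_reply). The version-- / mark_dirty(version + 1) dance simply
// re-dirties the inode at the version it was created with; since the inode
// is brand new, there is no older projected version to collide with.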
5117
5118
5119 void Server::handle_client_mknod(MDRequestRef& mdr)
5120 {
5121 MClientRequest *req = mdr->client_request;
5122 client_t client = mdr->get_client();
5123 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5124 file_layout_t *dir_layout = NULL;
5125 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false,
5126 &dir_layout);
5127 if (!dn) return;
5128 if (mdr->snapid != CEPH_NOSNAP) {
5129 respond_to_request(mdr, -EROFS);
5130 return;
5131 }
5132 CInode *diri = dn->get_dir()->get_inode();
5133 rdlocks.insert(&diri->authlock);
5134 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5135 return;
5136
5137 if (!check_access(mdr, diri, MAY_WRITE))
5138 return;
5139
5140 if (!check_fragment_space(mdr, dn->get_dir()))
5141 return;
5142
5143 unsigned mode = req->head.args.mknod.mode;
5144 if ((mode & S_IFMT) == 0)
5145 mode |= S_IFREG;
5146
5147 // set layout
5148 file_layout_t layout;
5149 if (dir_layout && S_ISREG(mode))
5150 layout = *dir_layout;
5151 else
5152 layout = mdcache->default_file_layout;
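// dir_layout is only non-NULL when a layout policy was found on an ancestor
// directory (it is filled in by rdlock_path_xlock_dentry above); regular
// files inherit it, everything else gets the default layout. Illustrative
// client-side example (assumes a mounted cephfs; not taken from this file):
//
//   setfattr -n ceph.dir.layout.pool -v mypool /mnt/cephfs/dir
//
// after which regular files mknod'd under dir pick up pool "mypool" here.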
5153
5154 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5155 snapid_t follows = realm->get_newest_seq();
5156 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
5157 mode, &layout);
5158 assert(newi);
5159
5160 dn->push_projected_linkage(newi);
5161
5162 newi->inode.rdev = req->head.args.mknod.rdev;
5163 newi->inode.version = dn->pre_dirty();
5164 newi->inode.rstat.rfiles = 1;
5165 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5166 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5167 newi->inode.update_backtrace();
5168
5169 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5170 // want to write to it (e.g., if they are reexporting NFS)
5171 if (S_ISREG(newi->inode.mode)) {
5172 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5173 newi->inode.client_ranges[client].range.first = 0;
5174 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5175 newi->inode.client_ranges[client].follows = follows;
5176
5177 // issue a cap on the file
5178 int cmode = CEPH_FILE_MODE_RDWR;
5179 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5180 if (cap) {
5181 cap->set_wanted(0);
5182
5183 // put locks in excl mode
5184 newi->filelock.set_state(LOCK_EXCL);
5185 newi->authlock.set_state(LOCK_EXCL);
5186 newi->xattrlock.set_state(LOCK_EXCL);
5187 }
5188 }
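// Starting filelock/authlock/xattrlock in LOCK_EXCL lets the new cap carry
// exclusive rights immediately, so the creating client can write without a
// second lock/cap round-trip; this is only reasonable because the inode is
// brand new and no other client can hold caps on it yet.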
5189
5190 assert(dn->first == follows + 1);
5191 newi->first = dn->first;
5192
5193 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5194
5195 // prepare finisher
5196 mdr->ls = mdlog->get_current_segment();
5197 EUpdate *le = new EUpdate(mdlog, "mknod");
5198 mdlog->start_entry(le);
5199 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5200 journal_allocated_inos(mdr, &le->metablob);
5201
5202 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5203 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5204 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5205
5206 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5207 }
5208
5209
5210
5211 // MKDIR
5212 /* This function takes responsibility for the passed mdr */
5213 void Server::handle_client_mkdir(MDRequestRef& mdr)
5214 {
5215 MClientRequest *req = mdr->client_request;
5216 if (req->get_filepath().is_last_dot_or_dotdot()) {
5217 respond_to_request(mdr, -EEXIST);
5218 return;
5219 }
5220
5221 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5222 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5223 if (!dn) return;
5224 if (mdr->snapid != CEPH_NOSNAP) {
5225 respond_to_request(mdr, -EROFS);
5226 return;
5227 }
5228 CDir *dir = dn->get_dir();
5229 CInode *diri = dir->get_inode();
5230 rdlocks.insert(&diri->authlock);
5231 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5232 return;
5233
5234 // mkdir check access
5235 if (!check_access(mdr, diri, MAY_WRITE))
5236 return;
5237
5238 if (!check_fragment_space(mdr, dir))
5239 return;
5240
5241 // new inode
5242 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5243 snapid_t follows = realm->get_newest_seq();
5244
5245 unsigned mode = req->head.args.mkdir.mode;
5246 mode &= ~S_IFMT;
5247 mode |= S_IFDIR;
5248 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5249 assert(newi);
5250
5251 // it's a directory.
5252 dn->push_projected_linkage(newi);
5253
5254 newi->inode.version = dn->pre_dirty();
5255 newi->inode.rstat.rsubdirs = 1;
5256 newi->inode.update_backtrace();
5257
5258 dout(12) << " follows " << follows << dendl;
5259 assert(dn->first == follows + 1);
5260 newi->first = dn->first;
5261
5262 // ...and that new dir is empty.
5263 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5264 newdir->state_set(CDir::STATE_CREATING);
5265 newdir->mark_complete();
5266 newdir->fnode.version = newdir->pre_dirty();
5267
5268 // prepare finisher
5269 mdr->ls = mdlog->get_current_segment();
5270 EUpdate *le = new EUpdate(mdlog, "mkdir");
5271 mdlog->start_entry(le);
5272 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5273 journal_allocated_inos(mdr, &le->metablob);
5274 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5275 le->metablob.add_primary_dentry(dn, newi, true, true);
5276 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5277
5278 // issue a cap on the directory
5279 int cmode = CEPH_FILE_MODE_RDWR;
5280 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5281 if (cap) {
5282 cap->set_wanted(0);
5283
5284 // put locks in excl mode
5285 newi->filelock.set_state(LOCK_EXCL);
5286 newi->authlock.set_state(LOCK_EXCL);
5287 newi->xattrlock.set_state(LOCK_EXCL);
5288 }
5289
5290 // make sure this inode gets into the journal
5291 le->metablob.add_opened_ino(newi->ino());
5292 LogSegment *ls = mds->mdlog->get_current_segment();
5293 ls->open_files.push_back(&newi->item_open_file);
5294
5295 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5296 }
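// mkdir-specific extras relative to mknod: the new dirfrag is journaled
// dirty+complete+new via add_new_dir(), and add_opened_ino() plus the
// open_files list entry keep the new inode tied to the current log segment,
// so (roughly) it is re-journaled rather than lost if the segment expires
// before the directory is written back.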
5297
5298
5299 // SYMLINK
5300
5301 void Server::handle_client_symlink(MDRequestRef& mdr)
5302 {
5303 MClientRequest *req = mdr->client_request;
5304 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5305 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5306 if (!dn) return;
5307 if (mdr->snapid != CEPH_NOSNAP) {
5308 respond_to_request(mdr, -EROFS);
5309 return;
5310 }
5311 CDir *dir = dn->get_dir();
5312 CInode *diri = dir->get_inode();
5313 rdlocks.insert(&diri->authlock);
5314 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5315 return;
5316
5317 if (!check_access(mdr, diri, MAY_WRITE))
5318 return;
5319
5320 if (!check_fragment_space(mdr, dir))
5321 return;
5322
5323 unsigned mode = S_IFLNK | 0777;
5324 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5325 assert(newi);
5326
5327 // it's a symlink
5328 dn->push_projected_linkage(newi);
5329
5330 newi->symlink = mempool::mds_co::string(boost::string_view(req->get_path2()));
5331 newi->inode.size = newi->symlink.length();
5332 newi->inode.rstat.rbytes = newi->inode.size;
5333 newi->inode.rstat.rfiles = 1;
5334 newi->inode.version = dn->pre_dirty();
5335 newi->inode.update_backtrace();
5336
5337 newi->first = dn->first;
5338
5339 // prepare finisher
5340 mdr->ls = mdlog->get_current_segment();
5341 EUpdate *le = new EUpdate(mdlog, "symlink");
5342 mdlog->start_entry(le);
5343 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5344 journal_allocated_inos(mdr, &le->metablob);
5345 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5346 le->metablob.add_primary_dentry(dn, newi, true, true);
5347
5348 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5349 }
5350
5351
5352
5353
5354
5355 // LINK
5356
5357 void Server::handle_client_link(MDRequestRef& mdr)
5358 {
5359 MClientRequest *req = mdr->client_request;
5360
5361 dout(7) << "handle_client_link " << req->get_filepath()
5362 << " to " << req->get_filepath2()
5363 << dendl;
5364
5365 set<SimpleLock*> rdlocks, wrlocks, xlocks;
5366
5367 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
5368 if (!dn) return;
5369 CInode *targeti = rdlock_path_pin_ref(mdr, 1, rdlocks, false);
5370 if (!targeti) return;
5371 if (mdr->snapid != CEPH_NOSNAP) {
5372 respond_to_request(mdr, -EROFS);
5373 return;
5374 }
5375
5376 CDir *dir = dn->get_dir();
5377 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5378 dout(7) << "target is " << *targeti << dendl;
5379 if (targeti->is_dir()) {
5380 // if srcdn is replica, need to make sure its linkage is correct
5381 vector<CDentry*>& trace = mdr->dn[1];
5382 if (trace.empty() ||
5383 trace.back()->is_auth() ||
5384 trace.back()->lock.can_read(mdr->get_client())) {
5385 dout(7) << "target is a dir, failing..." << dendl;
5386 respond_to_request(mdr, -EINVAL);
5387 return;
5388 }
5389 }
5390
5391 xlocks.insert(&targeti->linklock);
5392
5393 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
5394 return;
5395
5396 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5397 if (!check_access(mdr, targeti, MAY_WRITE))
5398 return;
5399
5400 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5401 return;
5402
5403 if (!check_fragment_space(mdr, dir))
5404 return;
5405 }
5406
5407 // go!
5408 assert(g_conf->mds_kill_link_at != 1);
5409
5410 // local or remote?
5411 if (targeti->is_auth())
5412 _link_local(mdr, dn, targeti);
5413 else
5414 _link_remote(mdr, true, dn, targeti);
5415 }
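// Two paths from here, keyed on where targeti is authoritative:
//  - local:  this mds owns targeti, so the nlink change and the new remote
//            dentry are journaled in one EUpdate (_link_local).
//  - remote: the nlink change must be journaled on targeti's auth mds, so
//            a two-phase update runs (_link_remote). Rough message flow, as
//            reconstructed from the handlers below (not authoritative):
//
//    master                          slave (targeti auth)
//      | --- OP_LINKPREP --------->  journals ESlaveUpdate PREPARE
//      | <-- OP_LINKPREPACK -------  |
//      |  journals its EUpdate,      |
//      |  then releases the slave->  journals ESlaveUpdate COMMIT
//      | <-- OP_COMMITTED ---------  |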
5416
5417
5418 class C_MDS_link_local_finish : public ServerLogContext {
5419 CDentry *dn;
5420 CInode *targeti;
5421 version_t dnpv;
5422 version_t tipv;
5423 public:
5424 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5425 version_t dnpv_, version_t tipv_) :
5426 ServerLogContext(s, r), dn(d), targeti(ti),
5427 dnpv(dnpv_), tipv(tipv_) { }
5428 void finish(int r) override {
5429 assert(r == 0);
5430 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv);
5431 }
5432 };
5433
5434
5435 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5436 {
5437 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
5438
5439 mdr->ls = mdlog->get_current_segment();
5440
5441 // predirty NEW dentry
5442 version_t dnpv = dn->pre_dirty();
5443 version_t tipv = targeti->pre_dirty();
5444
5445 // project inode update
5446 auto &pi = targeti->project_inode();
5447 pi.inode.nlink++;
5448 pi.inode.ctime = mdr->get_op_stamp();
5449 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5450 pi.inode.rstat.rctime = mdr->get_op_stamp();
5451 pi.inode.change_attr++;
5452 pi.inode.version = tipv;
5453
5454 // log + wait
5455 EUpdate *le = new EUpdate(mdlog, "link_local");
5456 mdlog->start_entry(le);
5457 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5458 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
5459 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
5460 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5461 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
5462
5463 // do this after predirty_*, to avoid funky extra dnl arg
5464 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5465
5466 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv));
5467 }
5468
5469 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
5470 version_t dnpv, version_t tipv)
5471 {
5472 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
5473
5474 // link and unlock the NEW dentry
5475 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5476 if (!dnl->get_inode())
5477 dn->link_remote(dnl, targeti);
5478 dn->mark_dirty(dnpv, mdr->ls);
5479
5480 // target inode
5481 targeti->pop_and_dirty_projected_inode(mdr->ls);
5482
5483 mdr->apply();
5484
5485 MDRequestRef null_ref;
5486 mdcache->send_dentry_link(dn, null_ref);
5487
5488 // bump target popularity
5489 utime_t now = ceph_clock_now();
5490 mds->balancer->hit_inode(now, targeti, META_POP_IWR);
5491 mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
5492
5493 // reply
5494 respond_to_request(mdr, 0);
5495 }
5496
5497
5498 // link / unlink remote
5499
5500 class C_MDS_link_remote_finish : public ServerLogContext {
5501 bool inc;
5502 CDentry *dn;
5503 CInode *targeti;
5504 version_t dpv;
5505 public:
5506 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
5507 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
5508 dpv(d->get_projected_version()) {}
5509 void finish(int r) override {
5510 assert(r == 0);
5511 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
5512 }
5513 };
5514
5515 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
5516 {
5517 dout(10) << "_link_remote "
5518 << (inc ? "link ":"unlink ")
5519 << *dn << " to " << *targeti << dendl;
5520
5521 // 1. send LinkPrepare to dest (journal nlink++ prepare)
5522 mds_rank_t linkauth = targeti->authority().first;
5523 if (mdr->more()->witnessed.count(linkauth) == 0) {
5524 if (mds->is_cluster_degraded() &&
5525 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
5526 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
5527 if (mdr->more()->waiting_on_slave.empty())
5528 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
5529 return;
5530 }
5531
5532 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
5533 int op;
5534 if (inc)
5535 op = MMDSSlaveRequest::OP_LINKPREP;
5536 else
5537 op = MMDSSlaveRequest::OP_UNLINKPREP;
5538 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, op);
5539 targeti->set_object_info(req->get_object_info());
5540 req->op_stamp = mdr->get_op_stamp();
5541 mds->send_message_mds(req, linkauth);
5542
5543 assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
5544 mdr->more()->waiting_on_slave.insert(linkauth);
5545 return;
5546 }
5547 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
5548
5549 assert(g_conf->mds_kill_link_at != 2);
5550
5551 mdr->set_mds_stamp(ceph_clock_now());
5552
5553 // add to event
5554 mdr->ls = mdlog->get_current_segment();
5555 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
5556 mdlog->start_entry(le);
5557 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5558 if (!mdr->more()->witnessed.empty()) {
5559 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
5560 le->reqid = mdr->reqid;
5561 le->had_slaves = true;
5562 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
5563 }
5564
5565 if (inc) {
5566 dn->pre_dirty();
5567 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
5568 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
5569 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
5570 } else {
5571 dn->pre_dirty();
5572 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
5573 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
5574 le->metablob.add_null_dentry(dn, true);
5575 dn->push_projected_linkage();
5576 }
5577
5578 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
5579 }
5580
5581 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
5582 CDentry *dn, CInode *targeti,
5583 version_t dpv)
5584 {
5585 dout(10) << "_link_remote_finish "
5586 << (inc ? "link ":"unlink ")
5587 << *dn << " to " << *targeti << dendl;
5588
5589 assert(g_conf->mds_kill_link_at != 3);
5590
5591 if (!mdr->more()->witnessed.empty())
5592 mdcache->logged_master_update(mdr->reqid);
5593
5594 if (inc) {
5595 // link the new dentry
5596 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
5597 if (!dnl->get_inode())
5598 dn->link_remote(dnl, targeti);
5599 dn->mark_dirty(dpv, mdr->ls);
5600 } else {
5601 // unlink main dentry
5602 dn->get_dir()->unlink_inode(dn);
5603 dn->pop_projected_linkage();
5604 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
5605 }
5606
5607 mdr->apply();
5608
5609 MDRequestRef null_ref;
5610 if (inc)
5611 mdcache->send_dentry_link(dn, null_ref);
5612 else
5613 mdcache->send_dentry_unlink(dn, NULL, null_ref);
5614
5615 // bump target popularity
5616 utime_t now = ceph_clock_now();
5617 mds->balancer->hit_inode(now, targeti, META_POP_IWR);
5618 mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
5619
5620 // reply
5621 respond_to_request(mdr, 0);
5622
5623 if (!inc)
5624 // removing a new dn?
5625 dn->get_dir()->try_remove_unlinked_dn(dn);
5626 }
5627
5628
5629 // remote linking/unlinking
5630
5631 class C_MDS_SlaveLinkPrep : public ServerLogContext {
5632 CInode *targeti;
5633 public:
5634 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t) :
5635 ServerLogContext(s, r), targeti(t) { }
5636 void finish(int r) override {
5637 assert(r == 0);
5638 server->_logged_slave_link(mdr, targeti);
5639 }
5640 };
5641
5642 class C_MDS_SlaveLinkCommit : public ServerContext {
5643 MDRequestRef mdr;
5644 CInode *targeti;
5645 public:
5646 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
5647 ServerContext(s), mdr(r), targeti(t) { }
5648 void finish(int r) override {
5649 server->_commit_slave_link(mdr, r, targeti);
5650 }
5651 };
5652
5653 /* This function DOES put (i.e. release) the mdr->slave_request before returning */
5654 void Server::handle_slave_link_prep(MDRequestRef& mdr)
5655 {
5656 dout(10) << "handle_slave_link_prep " << *mdr
5657 << " on " << mdr->slave_request->get_object_info()
5658 << dendl;
5659
5660 assert(g_conf->mds_kill_link_at != 4);
5661
5662 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
5663 assert(targeti);
5664 dout(10) << "targeti " << *targeti << dendl;
5665 CDentry *dn = targeti->get_parent_dn();
5666 CDentry::linkage_t *dnl = dn->get_linkage();
5667 assert(dnl->is_primary());
5668
5669 mdr->set_op_stamp(mdr->slave_request->op_stamp);
5670
5671 mdr->auth_pin(targeti);
5672
5673 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
5674 assert(g_conf->mds_kill_link_at != 5);
5675
5676 // journal it
5677 mdr->ls = mdlog->get_current_segment();
5678 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
5679 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
5680 mdlog->start_entry(le);
5681
5682 auto &pi = dnl->get_inode()->project_inode();
5683
5684 // update journaled target inode
5685 bool inc;
5686 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
5687 inc = true;
5688 pi.inode.nlink++;
5689 } else {
5690 inc = false;
5691 pi.inode.nlink--;
5692 }
5693
5694 link_rollback rollback;
5695 rollback.reqid = mdr->reqid;
5696 rollback.ino = targeti->ino();
5697 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
5698 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
5699 rollback.old_dir_mtime = pf->fragstat.mtime;
5700 rollback.old_dir_rctime = pf->rstat.rctime;
5701 rollback.was_inc = inc;
5702 ::encode(rollback, le->rollback);
5703 mdr->more()->rollback_bl = le->rollback;
5704
5705 pi.inode.ctime = mdr->get_op_stamp();
5706 pi.inode.version = targeti->pre_dirty();
5707
5708 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
5709
5710 // commit case
5711 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
5712 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
5713
5714 // set up commit waiter
5715 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
5716
5717 mdr->more()->slave_update_journaled = true;
5718 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti),
5719 mdr, __func__);
5720 mdlog->flush();
5721 }
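// The rollback blob journaled with the PREPARE above is what makes this
// slave update abortable: it captures just enough pre-state (old inode
// ctime, parent dir mtime/rctime, and whether the op was an inc or dec)
// for do_link_rollback() below to reverse the nlink change if the master
// aborts instead of committing.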
5722
5723 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti)
5724 {
5725 dout(10) << "_logged_slave_link " << *mdr
5726 << " " << *targeti << dendl;
5727
5728 assert(g_conf->mds_kill_link_at != 6);
5729
5730 // update the target
5731 targeti->pop_and_dirty_projected_inode(mdr->ls);
5732 mdr->apply();
5733
5734 // hit pop
5735 utime_t now = ceph_clock_now();
5736 mds->balancer->hit_inode(now, targeti, META_POP_IWR);
5737
5738 // done.
5739 mdr->reset_slave_request();
5740
5741 // ack
5742 if (!mdr->aborted) {
5743 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5744 MMDSSlaveRequest::OP_LINKPREPACK);
5745 mds->send_message_mds(reply, mdr->slave_to_mds);
5746 } else {
5747 dout(10) << " abort flag set, finishing" << dendl;
5748 mdcache->request_finish(mdr);
5749 }
5750 }
5751
5752
5753 struct C_MDS_CommittedSlave : public ServerLogContext {
5754 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
5755 void finish(int r) override {
5756 server->_committed_slave(mdr);
5757 }
5758 };
5759
5760 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
5761 {
5762 dout(10) << "_commit_slave_link " << *mdr
5763 << " r=" << r
5764 << " " << *targeti << dendl;
5765
5766 assert(g_conf->mds_kill_link_at != 7);
5767
5768 if (r == 0) {
5769 // drop our pins, etc.
5770 mdr->cleanup();
5771
5772 // write a commit to the journal
5773 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
5774 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
5775 mdlog->start_entry(le);
5776 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
5777 mdlog->flush();
5778 } else {
5779 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
5780 }
5781 }
5782
5783 void Server::_committed_slave(MDRequestRef& mdr)
5784 {
5785 dout(10) << "_committed_slave " << *mdr << dendl;
5786
5787 assert(g_conf->mds_kill_link_at != 8);
5788
5789 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
5790 MMDSSlaveRequest::OP_COMMITTED);
5791 mds->send_message_mds(req, mdr->slave_to_mds);
5792 mdcache->request_finish(mdr);
5793 }
5794
5795 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
5796 MutationRef mut;
5797 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r) : ServerLogContext(s, r), mut(m) {}
5798 void finish(int r) override {
5799 server->_link_rollback_finish(mut, mdr);
5800 }
5801 };
5802
5803 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
5804 {
5805 link_rollback rollback;
5806 bufferlist::iterator p = rbl.begin();
5807 ::decode(rollback, p);
5808
5809 dout(10) << "do_link_rollback on " << rollback.reqid
5810 << (rollback.was_inc ? " inc":" dec")
5811 << " ino " << rollback.ino
5812 << dendl;
5813
5814 assert(g_conf->mds_kill_link_at != 9);
5815
5816 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
5817 assert(mdr || mds->is_resolve());
5818
5819 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
5820 mut->ls = mds->mdlog->get_current_segment();
5821
5822 CInode *in = mdcache->get_inode(rollback.ino);
5823 assert(in);
5824 dout(10) << " target is " << *in << dendl;
5825 assert(!in->is_projected()); // live slave requests hold the versionlock xlock.
5826
5827 auto &pi = in->project_inode();
5828 pi.inode.version = in->pre_dirty();
5829 mut->add_projected_inode(in);
5830
5831 // parent dir rctime
5832 CDir *parent = in->get_projected_parent_dn()->get_dir();
5833 fnode_t *pf = parent->project_fnode();
5834 mut->add_projected_fnode(parent);
5835 pf->version = parent->pre_dirty();
5836 if (pf->fragstat.mtime == pi.inode.ctime) {
5837 pf->fragstat.mtime = rollback.old_dir_mtime;
5838 if (pf->rstat.rctime == pi.inode.ctime)
5839 pf->rstat.rctime = rollback.old_dir_rctime;
5840 mut->add_updated_lock(&parent->get_inode()->filelock);
5841 mut->add_updated_lock(&parent->get_inode()->nestlock);
5842 }
5843
5844 // inode
5845 pi.inode.ctime = rollback.old_ctime;
5846 if (rollback.was_inc)
5847 pi.inode.nlink--;
5848 else
5849 pi.inode.nlink++;
5850
5851 // journal it
5852 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
5853 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
5854 mdlog->start_entry(le);
5855 le->commit.add_dir_context(parent);
5856 le->commit.add_dir(parent, true);
5857 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
5858
5859 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr),
5860 mdr, __func__);
5861 mdlog->flush();
5862 }
5863
5864 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr)
5865 {
5866 dout(10) << "_link_rollback_finish" << dendl;
5867
5868 assert(g_conf->mds_kill_link_at != 10);
5869
5870 mut->apply();
5871 if (mdr)
5872 mdcache->request_finish(mdr);
5873
5874 mdcache->finish_rollback(mut->reqid);
5875
5876 mut->cleanup();
5877 }
5878
5879
5880 /* This function DOES NOT put the passed message before returning */
5881 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *m)
5882 {
5883 dout(10) << "handle_slave_link_prep_ack " << *mdr
5884 << " " << *m << dendl;
5885 mds_rank_t from = mds_rank_t(m->get_source().num());
5886
5887 assert(g_conf->mds_kill_link_at != 11);
5888
5889 // note slave
5890 mdr->more()->slaves.insert(from);
5891
5892 // witnessed!
5893 assert(mdr->more()->witnessed.count(from) == 0);
5894 mdr->more()->witnessed.insert(from);
5895 assert(!m->is_not_journaled());
5896 mdr->more()->has_journaled_slaves = true;
5897
5898 // remove from waiting list
5899 assert(mdr->more()->waiting_on_slave.count(from));
5900 mdr->more()->waiting_on_slave.erase(from);
5901
5902 assert(mdr->more()->waiting_on_slave.empty());
5903
5904 dispatch_client_request(mdr); // go again!
5905 }
5906
5907
5908
5909
5910
5911 // UNLINK
5912
5913 void Server::handle_client_unlink(MDRequestRef& mdr)
5914 {
5915 MClientRequest *req = mdr->client_request;
5916 client_t client = mdr->get_client();
5917
5918 // rmdir or unlink?
5919 bool rmdir = false;
5920 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
5921
5922 const filepath& refpath = req->get_filepath();
5923 if (refpath.depth() == 0) {
5924 respond_to_request(mdr, -EINVAL);
5925 return;
5926 }
5927 if (refpath.is_last_dot_or_dotdot()) {
5928 respond_to_request(mdr, -ENOTEMPTY);
5929 return;
5930 }
5931
5932 // traverse to path
5933 vector<CDentry*> trace;
5934 CInode *in;
5935 int r = mdcache->path_traverse(mdr, NULL, NULL, refpath, &trace, &in, MDS_TRAVERSE_FORWARD);
5936 if (r > 0) return;
5937 if (r < 0) {
5938 if (r == -ESTALE) {
5939 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
5940 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
5941 return;
5942 }
5943 respond_to_request(mdr, r);
5944 return;
5945 }
5946 if (mdr->snapid != CEPH_NOSNAP) {
5947 respond_to_request(mdr, -EROFS);
5948 return;
5949 }
5950
5951 CDentry *dn = trace.back();
5952 assert(dn);
5953 if (!dn->is_auth()) {
5954 mdcache->request_forward(mdr, dn->authority().first);
5955 return;
5956 }
5957
5958 CInode *diri = dn->get_dir()->get_inode();
5959
5960 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
5961 assert(!dnl->is_null());
5962
5963 if (rmdir) {
5964 dout(7) << "handle_client_rmdir on " << *dn << dendl;
5965 } else {
5966 dout(7) << "handle_client_unlink on " << *dn << dendl;
5967 }
5968 dout(7) << "dn links to " << *in << dendl;
5969
5970 // rmdir vs is_dir
5971 if (in->is_dir()) {
5972 if (rmdir) {
5973 // do empty directory checks
5974 if (_dir_is_nonempty_unlocked(mdr, in)) {
5975 respond_to_request(mdr, -ENOTEMPTY);
5976 return;
5977 }
5978 } else {
5979 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
5980 respond_to_request(mdr, -EISDIR);
5981 return;
5982 }
5983 } else {
5984 if (rmdir) {
5985 // unlink
5986 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
5987 respond_to_request(mdr, -ENOTDIR);
5988 return;
5989 }
5990 }
5991
5992 // -- create stray dentry? --
5993 CDentry *straydn = NULL;
5994 if (dnl->is_primary()) {
5995 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
5996 if (!straydn)
5997 return;
5998 dout(10) << " straydn is " << *straydn << dendl;
5999 } else if (mdr->straydn) {
6000 mdr->unpin(mdr->straydn);
6001 mdr->straydn = NULL;
6002 }
6003
6004 // lock
6005 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6006 for (int i=0; i<(int)trace.size()-1; i++) {
6007 rdlocks.insert(&trace[i]->lock);
6008 }
6009 xlocks.insert(&dn->lock);
6010 wrlocks.insert(&diri->filelock);
6011 wrlocks.insert(&diri->nestlock);
6012 xlocks.insert(&in->linklock);
6013 if (straydn) {
6014 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6015 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6016 xlocks.insert(&straydn->lock);
6017 }
6018 if (in->is_dir())
6019 rdlocks.insert(&in->filelock); // to verify it's empty
6020 mds->locker->include_snap_rdlocks(rdlocks, dnl->get_inode());
6021
6022 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
6023 return;
6024
6025 if (in->is_dir() &&
6026 _dir_is_nonempty(mdr, in)) {
6027 respond_to_request(mdr, -ENOTEMPTY);
6028 return;
6029 }
6030
6031 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6032 if (!check_access(mdr, diri, MAY_WRITE))
6033 return;
6034 }
6035
6036 // yay!
6037 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
6038 // subtree root auths need to be witnesses
6039 set<mds_rank_t> witnesses;
6040 in->list_replicas(witnesses);
6041 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6042
6043 for (set<mds_rank_t>::iterator p = witnesses.begin();
6044 p != witnesses.end();
6045 ++p) {
6046 if (mdr->more()->witnessed.count(*p)) {
6047 dout(10) << " already witnessed by mds." << *p << dendl;
6048 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6049 dout(10) << " already waiting on witness mds." << *p << dendl;
6050 } else {
6051 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
6052 return;
6053 }
6054 }
6055 if (!mdr->more()->waiting_on_slave.empty())
6056 return; // we're waiting for a witness.
6057 }
6058
6059 // ok!
6060 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
6061 _link_remote(mdr, false, dn, dnl->get_inode());
6062 else
6063 _unlink_local(mdr, dn, straydn);
6064 }
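// Dispatch summary (sketch): a remote dentry whose inode is not ours means
// the nlink-- must be journaled on the inode's auth mds, so it is handled
// as the inverse of link (_link_remote with inc=false). Everything else
// goes through _unlink_local; for a primary dentry that relinks the inode
// under the stray dentry prepared above, where it lingers until caps and
// any remaining remote links drain and the stray machinery can purge it.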
6065
6066 class C_MDS_unlink_local_finish : public ServerLogContext {
6067 CDentry *dn;
6068 CDentry *straydn;
6069 version_t dnpv; // deleted dentry
6070 public:
6071 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
6072 ServerLogContext(s, r), dn(d), straydn(sd),
6073 dnpv(d->get_projected_version()) {}
6074 void finish(int r) override {
6075 assert(r == 0);
6076 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
6077 }
6078 };
6079
6080 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6081 {
6082 dout(10) << "_unlink_local " << *dn << dendl;
6083
6084 CDentry::linkage_t *dnl = dn->get_projected_linkage();
6085 CInode *in = dnl->get_inode();
6086
6087 SnapRealm *realm = in->find_snaprealm();
6088 snapid_t follows = realm->get_newest_seq();
6089
6090 // ok, let's do it.
6091 mdr->ls = mdlog->get_current_segment();
6092
6093 // prepare log entry
6094 EUpdate *le = new EUpdate(mdlog, "unlink_local");
6095 mdlog->start_entry(le);
6096 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6097 if (!mdr->more()->witnessed.empty()) {
6098 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6099 le->reqid = mdr->reqid;
6100 le->had_slaves = true;
6101 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6102 }
6103
6104 if (straydn) {
6105 assert(dnl->is_primary());
6106 straydn->push_projected_linkage(in);
6107 straydn->first = follows + 1;
6108 }
6109
6110 // the unlinked dentry
6111 dn->pre_dirty();
6112
6113 auto &pi = in->project_inode();
6114 {
6115 std::string t;
6116 dn->make_path_string(t, true);
6117 pi.inode.stray_prior_path = mempool::mds_co::string(boost::string_view(t));
6118 }
6119 mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
6120 pi.inode.version = in->pre_dirty();
6121 pi.inode.ctime = mdr->get_op_stamp();
6122 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6123 pi.inode.rstat.rctime = mdr->get_op_stamp();
6124 pi.inode.change_attr++;
6125 pi.inode.nlink--;
6126 if (pi.inode.nlink == 0)
6127 in->state_set(CInode::STATE_ORPHAN);
6128
6129 if (dnl->is_primary()) {
6130 // primary link. add stray dentry.
6131 assert(straydn);
6132 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
6133 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6134
6135 // project snaprealm, too
6136 if (in->snaprealm || follows + 1 > in->get_oldest_snap())
6137 in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
6138
6139 pi.inode.update_backtrace();
6140 le->metablob.add_primary_dentry(straydn, in, true, true);
6141 } else {
6142 // remote link. update remote inode.
6143 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
6144 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6145 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
6146 }
6147
6148 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6149 le->metablob.add_null_dentry(dn, true);
6150
6151 if (in->is_dir()) {
6152 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6153 le->metablob.renamed_dirino = in->ino();
6154 }
6155
6156 dn->push_projected_linkage();
6157
6158 if (in->is_dir()) {
6159 assert(straydn);
6160 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6161
6162 in->maybe_export_pin(true);
6163 }
6164
6165 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6166 }
6167
6168 void Server::_unlink_local_finish(MDRequestRef& mdr,
6169 CDentry *dn, CDentry *straydn,
6170 version_t dnpv)
6171 {
6172 dout(10) << "_unlink_local_finish " << *dn << dendl;
6173
6174 if (!mdr->more()->witnessed.empty())
6175 mdcache->logged_master_update(mdr->reqid);
6176
6177 // unlink main dentry
6178 dn->get_dir()->unlink_inode(dn);
6179 dn->pop_projected_linkage();
6180
6181 // relink as stray? (i.e. was primary link?)
6182 CInode *strayin = NULL;
6183 bool snap_is_new = false;
6184 if (straydn) {
6185 dout(20) << " straydn is " << *straydn << dendl;
6186 CDentry::linkage_t *straydnl = straydn->pop_projected_linkage();
6187 strayin = straydnl->get_inode();
6188
6189 snap_is_new = strayin->snaprealm ? true : false;
6190 mdcache->touch_dentry_bottom(straydn);
6191 }
6192
6193 dn->mark_dirty(dnpv, mdr->ls);
6194 mdr->apply();
6195
6196 if (snap_is_new) //only new if strayin exists
6197 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, true);
6198
6199 mdcache->send_dentry_unlink(dn, straydn, mdr);
6200
6201 // update subtree map?
6202 if (straydn && strayin->is_dir())
6203 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6204
6205 // bump pop
6206 utime_t now = ceph_clock_now();
6207 mds->balancer->hit_dir(now, dn->get_dir(), META_POP_IWR);
6208
6209 // reply
6210 respond_to_request(mdr, 0);
6211
6212 // removing a new dn?
6213 dn->get_dir()->try_remove_unlinked_dn(dn);
6214
6215 // clean up ?
6216 // respond_to_request() drops locks. So stray reintegration can race with us.
6217 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6218 // Tip off the MDCache that this dentry is a stray that
6219 // might be eligible for purge.
6220 mdcache->notify_stray(straydn);
6221 }
6222 }
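// notify_stray() hands the dentry to the stray-handling machinery, which
// decides whether the inode can be purged now, must wait for caps or snaps
// to drain, or should be reintegrated because a remote link still points
// at it; the projected-linkage check above avoids queueing a dentry that a
// racing reintegration has already relinked.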
6223
6224 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6225 {
6226 if (mds->is_cluster_degraded() &&
6227 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6228 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6229 if (mdr->more()->waiting_on_slave.empty())
6230 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6231 return false;
6232 }
6233
6234 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6235 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6236 MMDSSlaveRequest::OP_RMDIRPREP);
6237 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6238 for (auto dn : trace)
6239 req->srcdnpath.push_dentry(dn->get_name());
6240 mdcache->replicate_stray(straydn, who, req->stray);
6241
6242 req->op_stamp = mdr->get_op_stamp();
6243 mds->send_message_mds(req, who);
6244
6245 assert(mdr->more()->waiting_on_slave.count(who) == 0);
6246 mdr->more()->waiting_on_slave.insert(who);
6247 return true;
6248 }
6249
6250 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6251 CDentry *dn, *straydn;
6252 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6253 : ServerLogContext(s, r), dn(d), straydn(st) {}
6254 void finish(int r) override {
6255 server->_logged_slave_rmdir(mdr, dn, straydn);
6256 }
6257 };
6258
6259 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6260 MDRequestRef mdr;
6261 CDentry *straydn;
6262 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6263 : ServerContext(s), mdr(r), straydn(sd) { }
6264 void finish(int r) override {
6265 server->_commit_slave_rmdir(mdr, r, straydn);
6266 }
6267 };
6268
6269 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6270 {
6271 dout(10) << "handle_slave_rmdir_prep " << *mdr
6272 << " " << mdr->slave_request->srcdnpath
6273 << " to " << mdr->slave_request->destdnpath
6274 << dendl;
6275
6276 vector<CDentry*> trace;
6277 filepath srcpath(mdr->slave_request->srcdnpath);
6278 dout(10) << " src " << srcpath << dendl;
6279 CInode *in;
6280 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6281 if (r > 0) return;
6282 if (r == -ESTALE) {
6283 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6284 mdr->slave_to_mds);
6285 return;
6286 }
6287 assert(r == 0);
6288 CDentry *dn = trace.back();
6289 dout(10) << " dn " << *dn << dendl;
6290 mdr->pin(dn);
6291
6292 assert(mdr->straydn);
6293 CDentry *straydn = mdr->straydn;
6294 dout(10) << " straydn " << *straydn << dendl;
6295
6296 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6297
6298 rmdir_rollback rollback;
6299 rollback.reqid = mdr->reqid;
6300 rollback.src_dir = dn->get_dir()->dirfrag();
6301 rollback.src_dname = std::string(dn->get_name());
6302 rollback.dest_dir = straydn->get_dir()->dirfrag();
6303 rollback.dest_dname = std::string(straydn->get_name());
6304 ::encode(rollback, mdr->more()->rollback_bl);
6305 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6306
6307 // set up commit waiter
6308 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
6309
6310 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6311 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6312 dn->get_dir()->unlink_inode(dn);
6313 straydn->get_dir()->link_primary_inode(straydn, in);
6314
6315 assert(straydn->first >= in->first);
6316 in->first = straydn->first;
6317
6318 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false);
6319
6320 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6321 MMDSSlaveRequest::OP_RMDIRPREPACK);
6322 reply->mark_not_journaled();
6323 mds->send_message_mds(reply, mdr->slave_to_mds);
6324
6325 // send caps to auth (if we're not already)
6326 if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS))
6327 mdcache->migrator->export_caps(in);
6328
6329 mdcache->touch_dentry_bottom(straydn); // move stray to end of lru
6330
6331 mdr->slave_request->put();
6332 mdr->slave_request = 0;
6333 mdr->straydn = 0;
6334 return;
6335 }
6336
6337 straydn->push_projected_linkage(in);
6338 dn->push_projected_linkage();
6339
6340 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6341 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
6342 mdlog->start_entry(le);
6343 le->rollback = mdr->more()->rollback_bl;
6344
6345 le->commit.add_dir_context(straydn->get_dir());
6346 le->commit.add_primary_dentry(straydn, in, true);
6347 // slave: no need to journal original dentry
6348
6349 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6350 le->commit.renamed_dirino = in->ino();
6351
6352 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6353
6354 mdr->more()->slave_update_journaled = true;
6355 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
6356 mdr, __func__);
6357 mdlog->flush();
6358 }
6359
6360 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6361 {
6362 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
6363
6364 // update our cache now, so we are consistent with what is in the journal
6365 // when we journal a subtree map
6366 CInode *in = dn->get_linkage()->get_inode();
6367 dn->get_dir()->unlink_inode(dn);
6368 straydn->pop_projected_linkage();
6369 dn->pop_projected_linkage();
6370 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), true);
6371
6372 // done.
6373 mdr->reset_slave_request();
6374 mdr->straydn = 0;
6375
6376 if (!mdr->aborted) {
6377 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
6378 MMDSSlaveRequest::OP_RMDIRPREPACK);
6379 mds->send_message_mds(reply, mdr->slave_to_mds);
6380 } else {
6381 dout(10) << " abort flag set, finishing" << dendl;
6382 mdcache->request_finish(mdr);
6383 }
6384 }
6385
6386 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
6387 {
6388 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
6389 << " " << *ack << dendl;
6390
6391 mds_rank_t from = mds_rank_t(ack->get_source().num());
6392
6393 mdr->more()->slaves.insert(from);
6394 mdr->more()->witnessed.insert(from);
6395 if (!ack->is_not_journaled())
6396 mdr->more()->has_journaled_slaves = true;
6397
6398 // remove from waiting list
6399 assert(mdr->more()->waiting_on_slave.count(from));
6400 mdr->more()->waiting_on_slave.erase(from);
6401
6402 if (mdr->more()->waiting_on_slave.empty())
6403 dispatch_client_request(mdr); // go again!
6404 else
6405 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
6406 }
6407
6408 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
6409 {
6410 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
6411
6412 if (r == 0) {
6413 if (mdr->more()->slave_update_journaled) {
6414 CInode *strayin = straydn->get_projected_linkage()->get_inode();
6415 if (strayin && !strayin->snaprealm)
6416 mdcache->clear_dirty_bits_for_stray(strayin);
6417 }
6418
6419 mdr->cleanup();
6420
6421 if (mdr->more()->slave_update_journaled) {
6422 // write a commit to the journal
6423 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
6424 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
6425 ESlaveUpdate::RMDIR);
6426 mdlog->start_entry(le);
6427 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6428 mdlog->flush();
6429 } else {
6430 _committed_slave(mdr);
6431 }
6432 } else {
6433 // abort
6434 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6435 }
6436 }
6437
6438 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
6439 metareqid_t reqid;
6440 CDentry *dn;
6441 CDentry *straydn;
6442 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
6443 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
6444 void finish(int r) override {
6445 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
6446 }
6447 };
6448
6449 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6450 {
6451 // unlike the other rollback methods, the rmdir rollback is only
6452 // needed to record the subtree changes in the journal for inode
6453 // replicas who are auth for empty dirfrags. no actual changes to
6454 // the file system are taking place here, so there is no Mutation.
6455
6456 rmdir_rollback rollback;
6457 bufferlist::iterator p = rbl.begin();
6458 ::decode(rollback, p);
6459
6460 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
6461 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6462 assert(mdr || mds->is_resolve());
6463
6464 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
6465 if (!dir)
6466 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
6467 assert(dir);
6468 CDentry *dn = dir->lookup(rollback.src_dname);
6469 assert(dn);
6470 dout(10) << " dn " << *dn << dendl;
6471 dir = mdcache->get_dirfrag(rollback.dest_dir);
6472 assert(dir);
6473 CDentry *straydn = dir->lookup(rollback.dest_dname);
6474 assert(straydn);
6475 dout(10) << " straydn " << *dn << dendl;
6476 CInode *in = straydn->get_linkage()->get_inode();
6477
6478 if (mdr && !mdr->more()->slave_update_journaled) {
6479 assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
6480
6481 straydn->get_dir()->unlink_inode(straydn);
6482 dn->get_dir()->link_primary_inode(dn, in);
6483
6484 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false);
6485
6486 mdcache->request_finish(mdr);
6487 mdcache->finish_rollback(rollback.reqid);
6488 return;
6489 }
6490
6491 dn->push_projected_linkage(in);
6492 straydn->push_projected_linkage();
6493
6494 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
6495 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
6496 mdlog->start_entry(le);
6497
6498 le->commit.add_dir_context(dn->get_dir());
6499 le->commit.add_primary_dentry(dn, in, true);
6500 // slave: no need to journal straydn
6501
6502 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6503 le->commit.renamed_dirino = in->ino();
6504
6505 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
6506
6507 submit_mdlog_entry(le,
6508 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
6509 dn, straydn),
6510 mdr, __func__);
6511 mdlog->flush();
6512 }
6513
6514 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
6515 {
6516 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
6517
6518 straydn->get_dir()->unlink_inode(straydn);
6519 dn->pop_projected_linkage();
6520 straydn->pop_projected_linkage();
6521
6522 CInode *in = dn->get_linkage()->get_inode();
6523 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), true);
6524 if (mds->is_resolve()) {
6525 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
6526 mdcache->try_trim_non_auth_subtree(root);
6527 }
6528
6529 if (mdr)
6530 mdcache->request_finish(mdr);
6531
6532 mdcache->finish_rollback(reqid);
6533 }
6534
6535
6536 /** _dir_is_nonempty[_unlocked]
6537 *
6538 * check if a directory is non-empty (i.e. rmdir must fail).
6539 *
6540 * the unlocked variant is a fastpath check. we can't really be
6541 * sure until we rdlock the filelock.
6542 */
6543 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
6544 {
6545 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
6546 assert(in->is_auth());
6547
6548 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
6549 return true; // in a snapshot!
6550
6551 list<CDir*> ls;
6552 in->get_dirfrags(ls);
6553 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6554 CDir *dir = *p;
6555 // is the frag obviously non-empty?
6556 if (dir->is_auth()) {
6557 if (dir->get_projected_fnode()->fragstat.size()) {
6558 dout(10) << "dir_is_nonempty_unlocked dirstat has "
6559 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
6560 return true;
6561 }
6562 }
6563 }
6564
6565 return false;
6566 }
6567
6568 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
6569 {
6570 dout(10) << "dir_is_nonempty " << *in << dendl;
6571 assert(in->is_auth());
6572 assert(in->filelock.can_read(mdr->get_client()));
6573
6574 frag_info_t dirstat;
6575 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
6576
6577 list<CDir*> ls;
6578 in->get_dirfrags(ls);
6579 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
6580 CDir *dir = *p;
6581 const fnode_t *pf = dir->get_projected_fnode();
6582 if (pf->fragstat.size()) {
6583 dout(10) << "dir_is_nonempty dirstat has "
6584 << pf->fragstat.size() << " items " << *dir << dendl;
6585 return true;
6586 }
6587
6588 if (pf->accounted_fragstat.version == dirstat_version)
6589 dirstat.add(pf->accounted_fragstat);
6590 else
6591 dirstat.add(pf->fragstat);
6592 }
6593
6594 return dirstat.size() != in->get_projected_inode()->dirstat.size();
6595 }
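// The version check above guards against double counting: a frag's
// accounted_fragstat is only trusted while it matches the inode's dirstat
// version. Worked example with illustrative numbers: two frags whose
// fragstats sum to 0 items while the inode dirstat still says size() == 1
// (an unlink's accounting not yet propagated) make the sizes disagree, so
// the directory is conservatively reported nonempty and rmdir returns
// -ENOTEMPTY instead of racing the rstat propagation.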
6596
6597
6598 // ======================================================
6599
6600
6601 class C_MDS_rename_finish : public ServerLogContext {
6602 CDentry *srcdn;
6603 CDentry *destdn;
6604 CDentry *straydn;
6605 public:
6606 C_MDS_rename_finish(Server *s, MDRequestRef& r,
6607 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
6608 ServerLogContext(s, r),
6609 srcdn(sdn), destdn(ddn), straydn(stdn) { }
6610 void finish(int r) override {
6611 assert(r == 0);
6612 server->_rename_finish(mdr, srcdn, destdn, straydn);
6613 }
6614 };
6615
6616
6617 /** handle_client_rename
6618 *
6619 * rename master is the destdn auth. this is because cached inodes
6620 * must remain connected. thus, any replica of srci, must also
6621 * replicate destdn, and possibly straydn, so that srci (and
6622 * destdn->inode) remain connected during the rename.
6623 *
6624 * to do this, we freeze srci, then master (destdn auth) verifies that
6625 * all other nodes have also replicated destdn and straydn. note that
6626 * destdn replicas need not also replicate srci. this only works when
6627 * destdn is master.
6628 *
6629 * This function takes responsibility for the passed mdr.
6630 */
6631 void Server::handle_client_rename(MDRequestRef& mdr)
6632 {
6633 MClientRequest *req = mdr->client_request;
6634 dout(7) << "handle_client_rename " << *req << dendl;
6635
6636 filepath destpath = req->get_filepath();
6637 filepath srcpath = req->get_filepath2();
6638 if (destpath.depth() == 0 || srcpath.depth() == 0) {
6639 respond_to_request(mdr, -EINVAL);
6640 return;
6641 }
6642 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
6643 respond_to_request(mdr, -EBUSY);
6644 return;
6645 }
6646
6647 boost::string_view destname = destpath.last_dentry();
6648
6649 vector<CDentry*>& srctrace = mdr->dn[1];
6650 vector<CDentry*>& desttrace = mdr->dn[0];
6651
6652 set<SimpleLock*> rdlocks, wrlocks, xlocks;
6653
6654 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, true, false, true);
6655 if (!destdn) return;
6656 dout(10) << " destdn " << *destdn << dendl;
6657 if (mdr->snapid != CEPH_NOSNAP) {
6658 respond_to_request(mdr, -EROFS);
6659 return;
6660 }
6661 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
6662 CDir *destdir = destdn->get_dir();
6663 assert(destdir->is_auth());
6664
6665 int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
6666 if (r > 0)
6667 return; // delayed
6668 if (r < 0) {
6669 if (r == -ESTALE) {
6670 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6671 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6672 } else {
6673 dout(10) << "FAIL on error " << r << dendl;
6674 respond_to_request(mdr, r);
6675 }
6676 return;
6677
6678 }
6679 assert(!srctrace.empty());
6680 CDentry *srcdn = srctrace.back();
6681 dout(10) << " srcdn " << *srcdn << dendl;
6682 if (srcdn->last != CEPH_NOSNAP) {
6683 respond_to_request(mdr, -EROFS);
6684 return;
6685 }
6686 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
6687 CInode *srci = srcdnl->get_inode();
6688 dout(10) << " srci " << *srci << dendl;
6689
6690 CInode *oldin = 0;
6691 if (!destdnl->is_null()) {
6692 //dout(10) << "dest dn exists " << *destdn << dendl;
6693 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
6694 if (!oldin) return;
6695 dout(10) << " oldin " << *oldin << dendl;
6696
6697 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
6698 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
6699 respond_to_request(mdr, -ENOTEMPTY);
6700 return;
6701 }
6702
6703 // if srcdn is replica, need to make sure its linkage is correct
6704 if (srcdn->is_auth() ||
6705 srcdn->lock.can_read(mdr->get_client()) ||
6706 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
6707 // mv /some/thing /to/some/existing_other_thing
6708 if (oldin->is_dir() && !srci->is_dir()) {
6709 respond_to_request(mdr, -EISDIR);
6710 return;
6711 }
6712 if (!oldin->is_dir() && srci->is_dir()) {
6713 respond_to_request(mdr, -ENOTDIR);
6714 return;
6715 }
6716 if (srci == oldin && !srcdn->get_dir()->inode->is_stray()) {
6717 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
6718 return;
6719 }
6720 }
6721 }
6722
6723 // -- some sanity checks --
6724
6725 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
6726 if (destpath.get_ino() != srcpath.get_ino() &&
6727 !(req->get_source().is_mds() &&
6728 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
6729 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
6730 CInode *destbase = desttrace[0]->get_dir()->get_inode();
6731 // ok, extend srctrace toward root until it is an ancestor of desttrace.
6732 while (srcbase != destbase &&
6733 !srcbase->is_projected_ancestor_of(destbase)) {
6734 CDentry *pdn = srcbase->get_projected_parent_dn();
6735 srctrace.insert(srctrace.begin(), pdn);
6736 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
6737 srcbase = pdn->get_dir()->get_inode();
6738 }
6739
6740 // then, extend destpath until it shares the same parent inode as srcpath.
6741 while (destbase != srcbase) {
6742 CDentry *pdn = destbase->get_projected_parent_dn();
6743 desttrace.insert(desttrace.begin(), pdn);
6744 rdlocks.insert(&pdn->lock);
6745 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
6746 destbase = pdn->get_dir()->get_inode();
6747 }
6748 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
6749 }
6750
6751 // src == dest?
6752 if (srcdn->get_dir() == destdir && srcdn->get_name() == destname) {
6753 dout(7) << "rename src=dest, noop" << dendl;
6754 respond_to_request(mdr, 0);
6755 return;
6756 }
6757
6758 // dest a child of src?
6759 // e.g. mv /usr /usr/foo
6760 CDentry *pdn = destdir->inode->get_projected_parent_dn();
6761 while (pdn) {
6762 if (pdn == srcdn) {
6763 dout(7) << "cannot rename item to be a child of itself" << dendl;
6764 respond_to_request(mdr, -EINVAL);
6765 return;
6766 }
6767 pdn = pdn->get_dir()->inode->parent;
6768 }
6769
6770 // is this a stray migration, reintegration or merge? (sanity checks!)
6771 if (mdr->reqid.name.is_mds() &&
6772 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
6773 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
6774 !(destdnl->is_remote() &&
6775 destdnl->get_remote_ino() == srci->ino())) {
6776 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
6777 return;
6778 }
6779
6780 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
6781 (srcdnl->is_primary() || destdnl->is_primary()));
6782 if (linkmerge)
6783 dout(10) << " this is a link merge" << dendl;
6784
6785 // -- create stray dentry? --
6786 CDentry *straydn = NULL;
6787 if (destdnl->is_primary() && !linkmerge) {
6788 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
6789 if (!straydn)
6790 return;
6791 dout(10) << " straydn is " << *straydn << dendl;
6792 } else if (mdr->straydn) {
6793 mdr->unpin(mdr->straydn);
6794 mdr->straydn = NULL;
6795 }
6796
6797 // -- prepare witness list --
6798 /*
6799 * NOTE: we use _all_ replicas as witnesses.
6800 * this probably isn't totally necessary (esp for file renames),
6801 * but if/when we change that, we have to make sure rejoin is
6802 * sufficiently robust to handle strong rejoins from survivors
6803 * with totally wrong dentry->inode linkage.
6804 * (currently, it can ignore rename effects, because the resolve
6805 * stage will sort them out.)
6806 */
6807 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
6808 if (srcdn->is_auth())
6809 srcdn->list_replicas(witnesses);
6810 else
6811 witnesses.insert(srcdn->authority().first);
6812 if (srcdnl->is_remote() && !srci->is_auth())
6813 witnesses.insert(srci->authority().first);
6814 destdn->list_replicas(witnesses);
6815 if (destdnl->is_remote() && !oldin->is_auth())
6816 witnesses.insert(oldin->authority().first);
6817 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6818
6819
6820 // -- locks --
6821 map<SimpleLock*, mds_rank_t> remote_wrlocks;
6822
6823 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
6824 for (int i=0; i<(int)srctrace.size(); i++)
6825 rdlocks.insert(&srctrace[i]->lock);
6826 xlocks.insert(&srcdn->lock);
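// unlinking srcdn dirties its dir's fragstat/rstat; those scatterlocks must be wrlocked
// on the src dir's auth MDS (remotely, if that is not us)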
6827 mds_rank_t srcdirauth = srcdn->get_dir()->authority().first;
6828 if (srcdirauth != mds->get_nodeid()) {
6829 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
6830 remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
6831 remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
6832 if (srci->is_dir())
6833 rdlocks.insert(&srci->dirfragtreelock);
6834 } else {
6835 wrlocks.insert(&srcdn->get_dir()->inode->filelock);
6836 wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
6837 }
6838 mds->locker->include_snap_rdlocks(rdlocks, srcdn->get_dir()->inode);
6839
6840 // straydn?
6841 if (straydn) {
6842 wrlocks.insert(&straydn->get_dir()->inode->filelock);
6843 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
6844 xlocks.insert(&straydn->lock);
6845 }
6846
6847 // xlock versionlock on dentries if there are witnesses.
6848 // replicas can't see projected dentry linkages, and will get
6849 // confused if we try to pipeline things.
6850 if (!witnesses.empty()) {
6851 // take xlock on all projected ancestor dentries for srcdn and destdn.
6852 // this ensures the srcdn and destdn can be traversed to by the witnesses.
6853 for (int i=0; i<(int)srctrace.size(); i++) {
6854 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
6855 xlocks.insert(&srctrace[i]->versionlock);
6856 }
6857 for (int i=0; i<(int)desttrace.size(); i++) {
6858 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
6859 xlocks.insert(&desttrace[i]->versionlock);
6860 }
6861 // xlock srci and oldin's primary dentries, so witnesses can call
6862 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
6863 // is traversed.
6864 if (srcdnl->is_remote())
6865 xlocks.insert(&srci->get_projected_parent_dn()->lock);
6866 if (destdnl->is_remote())
6867 xlocks.insert(&oldin->get_projected_parent_dn()->lock);
6868 }
6869
6870 // we need to update srci's ctime. xlock its least contended lock to do that...
6871 xlocks.insert(&srci->linklock);
6872
6873 // xlock oldin (for nlink--)
6874 if (oldin) {
6875 xlocks.insert(&oldin->linklock);
6876 if (oldin->is_dir())
6877 rdlocks.insert(&oldin->filelock);
6878 }
6879 if (srcdnl->is_primary() && srci->is_dir())
6880 // FIXME: this should happen whenever we are renaming between
6881 // realms, regardless of the file type
6882 // FIXME: If/when this changes, make sure to update the
6883 // "allowance" in handle_slave_rename_prep
6884 xlocks.insert(&srci->snaplock); // FIXME: an auth bcast could be sufficient?
6885 else
6886 rdlocks.insert(&srci->snaplock);
6887
6888 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
6889 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
6890 &remote_wrlocks, auth_pin_freeze))
6891 return;
6892
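// only run the access and fragment-space checks before any slave has witnessed the
// prepare; once witnesses are recorded, presumably the operation must proceed to commit
// or be rolled back rather than fail these checks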
6893 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6894 if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
6895 return;
6896
6897 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
6898 return;
6899
6900 if (!check_fragment_space(mdr, destdn->get_dir()))
6901 return;
6902
6903 if (!check_access(mdr, srci, MAY_WRITE))
6904 return;
6905 }
6906
6907 // with read lock, really verify oldin is empty
6908 if (oldin &&
6909 oldin->is_dir() &&
6910 _dir_is_nonempty(mdr, oldin)) {
6911 respond_to_request(mdr, -ENOTEMPTY);
6912 return;
6913 }
6914
6915 /* project_past_snaprealm_parent() will do this job
6916 *
6917 // moving between snaprealms?
6918 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
6919 SnapRealm *srcrealm = srci->find_snaprealm();
6920 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
6921 if (srcrealm != destrealm &&
6922 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
6923 destrealm->get_newest_seq() + 1 > srcdn->first)) {
6924 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
6925 mdcache->snaprealm_create(mdr, srci);
6926 return;
6927 }
6928 }
6929 */
6930
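// test hook: the mds_kill_rename_at config aborts the MDS at numbered points in the
// rename path to exercise recovery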
6931 assert(g_conf->mds_kill_rename_at != 1);
6932
6933 // -- open all srcdn inode frags, if any --
6934 // we need these open so that auth can properly delegate from inode to dirfrags
6935 // after the inode is _ours_.
6936 if (srcdnl->is_primary() &&
6937 !srcdn->is_auth() &&
6938 srci->is_dir()) {
6939 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
6940 mdr->set_stickydirs(srci);
6941
6942 list<frag_t> frags;
6943 srci->dirfragtree.get_leaves(frags);
6944 for (list<frag_t>::iterator p = frags.begin();
6945 p != frags.end();
6946 ++p) {
6947 CDir *dir = srci->get_dirfrag(*p);
6948 if (!dir) {
6949 dout(10) << " opening " << *p << " under " << *srci << dendl;
6950 mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
6951 return;
6952 }
6953 }
6954 }
6955
6956 // -- prepare witnesses --
6957
6958 // do srcdn auth last
6959 mds_rank_t last = MDS_RANK_NONE;
6960 if (!srcdn->is_auth()) {
6961 last = srcdn->authority().first;
6962 mdr->more()->srcdn_auth_mds = last;
6963 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
6964 // are involved in the rename operation.
6965 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
6966 dout(10) << " preparing ambiguous auth for srci" << dendl;
6967 assert(mdr->more()->is_remote_frozen_authpin);
6968 assert(mdr->more()->rename_inode == srci);
6969 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6970 return;
6971 }
6972 }
6973
6974 for (set<mds_rank_t>::iterator p = witnesses.begin();
6975 p != witnesses.end();
6976 ++p) {
6977 if (*p == last) continue; // do it last!
6978 if (mdr->more()->witnessed.count(*p)) {
6979 dout(10) << " already witnessed by mds." << *p << dendl;
6980 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6981 dout(10) << " already waiting on witness mds." << *p << dendl;
6982 } else {
6983 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
6984 return;
6985 }
6986 }
6987 if (!mdr->more()->waiting_on_slave.empty())
6988 return; // we're waiting for a witness.
6989
6990 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
6991 dout(10) << " preparing last witness (srcdn auth)" << dendl;
6992 assert(mdr->more()->waiting_on_slave.count(last) == 0);
6993 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
6994 return;
6995 }
6996
6997 // test hack: bail after the slave does prepare, so we can verify a _live_ rollback.
6998 if (!mdr->more()->slaves.empty() && !srci->is_dir())
6999 assert(g_conf->mds_kill_rename_at != 3);
7000 if (!mdr->more()->slaves.empty() && srci->is_dir())
7001 assert(g_conf->mds_kill_rename_at != 4);
7002
7003 // -- declare now --
7004 mdr->set_mds_stamp(ceph_clock_now());
7005
7006 // -- prepare journal entry --
7007 mdr->ls = mdlog->get_current_segment();
7008 EUpdate *le = new EUpdate(mdlog, "rename");
7009 mdlog->start_entry(le);
7010 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7011 if (!mdr->more()->witnessed.empty()) {
7012 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7013
7014 le->reqid = mdr->reqid;
7015 le->had_slaves = true;
7016
7017 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7018 // no need to send frozen auth pin to recovering auth MDS of srci
7019 mdr->more()->is_remote_frozen_authpin = false;
7020 }
7021
7022 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
7023 if (le->client_map.length())
7024 le->cmapv = mds->sessionmap.get_projected();
7025
7026 // -- commit locally --
7027 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
7028
7029 journal_and_reply(mdr, srci, destdn, le, fin);
7030 }
7031
7032
7033 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7034 {
7035 dout(10) << "_rename_finish " << *mdr << dendl;
7036
7037 if (!mdr->more()->witnessed.empty())
7038 mdcache->logged_master_update(mdr->reqid);
7039
7040 // apply
7041 _rename_apply(mdr, srcdn, destdn, straydn);
7042
7043 mdcache->send_dentry_link(destdn, mdr);
7044
7045 CDentry::linkage_t *destdnl = destdn->get_linkage();
7046 CInode *in = destdnl->get_inode();
7047 bool need_eval = mdr->more()->cap_imports.count(in);
7048
7049 // test hack: test slave commit
7050 if (!mdr->more()->slaves.empty() && !in->is_dir())
7051 assert(g_conf->mds_kill_rename_at != 5);
7052 if (!mdr->more()->slaves.empty() && in->is_dir())
7053 assert(g_conf->mds_kill_rename_at != 6);
7054
7055 // bump popularity
7056 utime_t now = ceph_clock_now();
7057 mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
7058 if (destdnl->is_remote() && in->is_auth())
7059 mds->balancer->hit_inode(now, in, META_POP_IWR);
7060
7061 // did we import srci? if so, explicitly ack that import before we unlock and reply.
7062
7063 assert(g_conf->mds_kill_rename_at != 7);
7064
7065 // reply
7066 respond_to_request(mdr, 0);
7067
7068 if (need_eval)
7069 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
7070
7071 // clean up?
7072 // respond_to_request() drops locks. So stray reintegration can race with us.
7073 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7074 mdcache->notify_stray(straydn);
7075 }
7076 }
7077
7078
7079
7080 // helpers
7081
7082 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
7083 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
7084 {
7085 if (mds->is_cluster_degraded() &&
7086 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7087 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
7088 if (mdr->more()->waiting_on_slave.empty())
7089 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7090 return false;
7091 }
7092
7093 dout(10) << "_rename_prepare_witness mds." << who << dendl;
7094 MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7095 MMDSSlaveRequest::OP_RENAMEPREP);
7096
7097 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
7098 for (auto dn : srctrace)
7099 req->srcdnpath.push_dentry(dn->get_name());
7100 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
7101 for (auto dn : dsttrace)
7102 req->destdnpath.push_dentry(dn->get_name());
7103 if (straydn)
7104 mdcache->replicate_stray(straydn, who, req->stray);
7105
7106 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7107
7108 // srcdn auth will verify our current witness list is sufficient
7109 req->witnesses = witnesses;
7110
7111 req->op_stamp = mdr->get_op_stamp();
7112 mds->send_message_mds(req, who);
7113
7114 assert(mdr->more()->waiting_on_slave.count(who) == 0);
7115 mdr->more()->waiting_on_slave.insert(who);
7116 return true;
7117 }
7118
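// decode the inode (and client cap map) that the srcdn auth bundled into its prepare
// ack, import it into our cache, and return its pre-import version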
7119 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
7120 {
7121 version_t oldpv = mdr->more()->inode_import_v;
7122
7123 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7124
7125 /* import node */
7126 bufferlist::iterator blp = mdr->more()->inode_import.begin();
7127
7128 // imported caps
7129 map<client_t,entity_inst_t> client_map;
7130 decode(client_map, blp);
7131 prepare_force_open_sessions(client_map, mdr->more()->imported_session_map);
7132 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
7133
7134 list<ScatterLock*> updated_scatterlocks;
7135 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
7136 mdr->more()->cap_imports, updated_scatterlocks);
7137
7138 // hack: force back to !auth and clean, temporarily
7139 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
7140 srcdnl->get_inode()->mark_clean();
7141
7142 return oldpv;
7143 }
7144
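// even when we are not auth for a renamed dentry, we must journal it if auth subtree
// dirfrags live beneath diri, so replay can reattach those subtrees; 'empty' checks
// diri's own frags, otherwise we check for auth subtrees nested under them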
7145 bool Server::_need_force_journal(CInode *diri, bool empty)
7146 {
7147 list<CDir*> ls;
7148 diri->get_dirfrags(ls);
7149
7150 bool force_journal = false;
7151 if (empty) {
7152 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7153 if ((*p)->is_subtree_root() && (*p)->get_dir_auth().first == mds->get_nodeid()) {
7154 dout(10) << " frag " << (*p)->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7155 force_journal = true;
7156 break;
7157 } else
7158 dout(20) << " frag " << (*p)->get_frag() << " is not auth subtree dirfrag" << dendl;
7159 }
7160 } else {
7161 // see if any children of our frags are auth subtrees.
7162 list<CDir*> subtrees;
7163 mdcache->list_subtrees(subtrees);
7164 dout(10) << " subtrees " << subtrees << " frags " << ls << dendl;
7165 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7166 CDir *dir = *p;
7167 for (list<CDir*>::iterator q = subtrees.begin(); q != subtrees.end(); ++q) {
7168 if (dir->contains(*q)) {
7169 if ((*q)->get_dir_auth().first == mds->get_nodeid()) {
7170 dout(10) << " frag " << (*p)->get_frag() << " contains (maybe) auth subtree, will force journal "
7171 << **q << dendl;
7172 force_journal = true;
7173 break;
7174 } else
7175 dout(20) << " frag " << (*p)->get_frag() << " contains but isn't auth for " << **q << dendl;
7176 } else
7177 dout(20) << " frag " << (*p)->get_frag() << " does not contain " << **q << dendl;
7178 }
7179 if (force_journal)
7180 break;
7181 }
7182 }
7183 return force_journal;
7184 }
7185
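// build the journal event for the rename: project the new dentry linkages (destdn,
// straydn, null srcdn), bump inode versions and ctimes, predirty parent stats, and add
// to the metablob whatever this MDS is auth for or forced to journal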
7186 void Server::_rename_prepare(MDRequestRef& mdr,
7187 EMetaBlob *metablob, bufferlist *client_map_bl,
7188 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7189 {
7190 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7191 if (straydn)
7192 dout(10) << " straydn " << *straydn << dendl;
7193
7194 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7195 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7196 CInode *srci = srcdnl->get_inode();
7197 CInode *oldin = destdnl->get_inode();
7198
7199 // primary+remote link merge?
7200 bool linkmerge = (srci == destdnl->get_inode() &&
7201 (srcdnl->is_primary() || destdnl->is_primary()));
7202 bool silent = srcdn->get_dir()->inode->is_stray();
7203
7204 bool force_journal_dest = false;
7205 if (srci->is_dir() && !destdn->is_auth()) {
7206 if (srci->is_auth()) {
7207 // if we are auth for srci and exporting it, force journal because journal replay needs
7208 // the source inode to create auth subtrees.
7209 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7210 force_journal_dest = true;
7211 } else
7212 force_journal_dest = _need_force_journal(srci, false);
7213 }
7214
7215 bool force_journal_stray = false;
7216 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7217 force_journal_stray = _need_force_journal(oldin, true);
7218
7219 if (linkmerge)
7220 dout(10) << " merging remote and primary links to the same inode" << dendl;
7221 if (silent)
7222 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7223 if (force_journal_dest)
7224 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7225 if (force_journal_stray)
7226 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7227
7228 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7229 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7230 metablob->renamed_dirino = srci->ino();
7231 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7232 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7233 metablob->renamed_dirino = oldin->ino();
7234 }
7235
7236 // prepare
7237 CInode::mempool_inode *spi = 0; // renamed inode
7238 CInode::mempool_inode *tpi = 0; // target/overwritten inode
7239
7240 // target inode
7241 if (!linkmerge) {
7242 if (destdnl->is_primary()) {
7243 assert(straydn); // moving to straydn.
7244 // link--, and move.
7245 if (destdn->is_auth()) {
7246 auto &pi = oldin->project_inode(); // project_snaprealm
7247 pi.inode.version = straydn->pre_dirty(pi.inode.version);
7248 pi.inode.update_backtrace();
7249 tpi = &pi.inode;
7250 }
7251 straydn->push_projected_linkage(oldin);
7252 } else if (destdnl->is_remote()) {
7253 // nlink-- targeti
7254 if (oldin->is_auth()) {
7255 auto &pi = oldin->project_inode();
7256 pi.inode.version = oldin->pre_dirty();
7257 tpi = &pi.inode;
7258 }
7259 }
7260 }
7261
7262 // dest
7263 if (srcdnl->is_remote()) {
7264 if (!linkmerge) {
7265 // destdn
7266 if (destdn->is_auth())
7267 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
7268 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7269 // srci
7270 if (srci->is_auth()) {
7271 auto &pi = srci->project_inode();
7272 pi.inode.version = srci->pre_dirty();
7273 spi = &pi.inode;
7274 }
7275 } else {
7276 dout(10) << " will merge remote onto primary link" << dendl;
7277 if (destdn->is_auth()) {
7278 auto &pi = oldin->project_inode();
7279 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
7280 spi = &pi.inode;
7281 }
7282 }
7283 } else { // primary
7284 if (destdn->is_auth()) {
7285 version_t oldpv;
7286 if (srcdn->is_auth())
7287 oldpv = srci->get_projected_version();
7288 else {
7289 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
7290
7291 // note which dirfrags have child subtrees in the journal
7292 // event, so that we can open those (as bounds) during replay.
7293 if (srci->is_dir()) {
7294 list<CDir*> ls;
7295 srci->get_dirfrags(ls);
7296 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7297 CDir *dir = *p;
7298 if (!dir->is_auth())
7299 metablob->renamed_dir_frags.push_back(dir->get_frag());
7300 }
7301 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
7302 }
7303 }
7304 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7305 // & srcdnl->snaprealm
7306 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
7307 pi.inode.update_backtrace();
7308 spi = &pi.inode;
7309 }
7310 destdn->push_projected_linkage(srci);
7311 }
7312
7313 // src
7314 if (srcdn->is_auth())
7315 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
7316 srcdn->push_projected_linkage(); // push null linkage
7317
7318 if (!silent) {
7319 if (spi) {
7320 spi->ctime = mdr->get_op_stamp();
7321 if (mdr->get_op_stamp() > spi->rstat.rctime)
7322 spi->rstat.rctime = mdr->get_op_stamp();
7323 spi->change_attr++;
7324 if (linkmerge)
7325 spi->nlink--;
7326 }
7327 if (tpi) {
7328 tpi->ctime = mdr->get_op_stamp();
7329 if (mdr->get_op_stamp() > tpi->rstat.rctime)
7330 tpi->rstat.rctime = mdr->get_op_stamp();
7331 tpi->change_attr++;
7332 {
7333 std::string t;
7334 destdn->make_path_string(t, true);
7335 tpi->stray_prior_path = mempool::mds_co::string(boost::string_view(t));
7336 }
7337 tpi->nlink--;
7338 if (tpi->nlink == 0)
7339 oldin->state_set(CInode::STATE_ORPHAN);
7340 }
7341 }
7342
7343 // prepare nesting, mtime updates
7344 int predirty_dir = silent ? 0:PREDIRTY_DIR;
7345
7346 // guarantee stray dir is processed first during journal replay. unlink the old inode,
7347 // then link the source inode to destdn
7348 if (destdnl->is_primary()) {
7349 assert(straydn);
7350 if (straydn->is_auth()) {
7351 metablob->add_dir_context(straydn->get_dir());
7352 metablob->add_dir(straydn->get_dir(), true);
7353 }
7354 }
7355
7356 // sub off target
7357 if (destdn->is_auth() && !destdnl->is_null()) {
7358 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
7359 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
7360 if (destdnl->is_primary()) {
7361 assert(straydn);
7362 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
7363 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7364 }
7365 }
7366
7367 // move srcdn
7368 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
7369 int flags = predirty_dir | predirty_primary;
7370 if (srcdn->is_auth())
7371 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
7372 if (destdn->is_auth())
7373 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
7374
7375 SnapRealm *src_realm = srci->find_snaprealm();
7376 SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
7377 snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
7378
7379 // add it all to the metablob
7380 // target inode
7381 if (!linkmerge) {
7382 if (destdnl->is_primary()) {
7383 assert(straydn);
7384 if (destdn->is_auth()) {
7385 // project snaprealm, too
7386 if (oldin->snaprealm || dest_realm->get_newest_seq() + 1 > oldin->get_oldest_snap())
7387 oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
7388 straydn->first = MAX(oldin->first, next_dest_snap);
7389 metablob->add_primary_dentry(straydn, oldin, true, true);
7390 } else if (force_journal_stray) {
7391 dout(10) << " forced journaling straydn " << *straydn << dendl;
7392 metablob->add_dir_context(straydn->get_dir());
7393 metablob->add_primary_dentry(straydn, oldin, true);
7394 }
7395 } else if (destdnl->is_remote()) {
7396 if (oldin->is_auth()) {
7397 // auth for targeti
7398 metablob->add_dir_context(oldin->get_projected_parent_dir());
7399 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
7400 CEPH_NOSNAP, 0, destdnl);
7401 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
7402 }
7403 }
7404 }
7405
7406 // dest
7407 if (srcdnl->is_remote()) {
7408 if (!linkmerge) {
7409 if (destdn->is_auth() && !destdnl->is_null())
7410 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7411 else
7412 destdn->first = MAX(destdn->first, next_dest_snap);
7413
7414 if (destdn->is_auth())
7415 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
7416 if (srci->get_projected_parent_dn()->is_auth()) { // it's remote
7417 metablob->add_dir_context(srci->get_projected_parent_dir());
7418 mdcache->journal_cow_dentry(mdr.get(), metablob, srci->get_projected_parent_dn(), CEPH_NOSNAP, 0, srcdnl);
7419 metablob->add_primary_dentry(srci->get_projected_parent_dn(), srci, true);
7420 }
7421 } else {
7422 if (destdn->is_auth() && !destdnl->is_null())
7423 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7424 else
7425 destdn->first = MAX(destdn->first, next_dest_snap);
7426
7427 if (destdn->is_auth())
7428 metablob->add_primary_dentry(destdn, destdnl->get_inode(), true, true);
7429 }
7430 } else if (srcdnl->is_primary()) {
7431 // project snap parent update?
7432 if (destdn->is_auth() && src_realm != dest_realm &&
7433 (srci->snaprealm || src_realm->get_newest_seq() + 1 > srci->get_oldest_snap()))
7434 srci->project_past_snaprealm_parent(dest_realm);
7435
7436 if (destdn->is_auth() && !destdnl->is_null())
7437 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
7438 else
7439 destdn->first = MAX(destdn->first, next_dest_snap);
7440
7441 if (destdn->is_auth())
7442 metablob->add_primary_dentry(destdn, srci, true, true);
7443 else if (force_journal_dest) {
7444 dout(10) << " forced journaling destdn " << *destdn << dendl;
7445 metablob->add_dir_context(destdn->get_dir());
7446 metablob->add_primary_dentry(destdn, srci, true);
7447 if (srcdn->is_auth() && srci->is_dir()) {
7448 // journal new subtrees root dirfrags
7449 list<CDir*> ls;
7450 srci->get_dirfrags(ls);
7451 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7452 CDir *dir = *p;
7453 if (dir->is_auth())
7454 metablob->add_dir(dir, true);
7455 }
7456 }
7457 }
7458 }
7459
7460 // src
7461 if (srcdn->is_auth()) {
7462 dout(10) << " journaling srcdn " << *srcdn << dendl;
7463 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
7464 // also journal the inode in case we need to do a slave rename rollback. It is OK to add
7465 // both primary and null dentries, because during journal replay the null dentry is
7466 // processed after the primary dentry.
7467 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
7468 metablob->add_primary_dentry(srcdn, srci, true);
7469 metablob->add_null_dentry(srcdn, true);
7470 } else
7471 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
7472
7473 // make renamed inode first track the dn
7474 if (srcdnl->is_primary() && destdn->is_auth())
7475 srci->first = destdn->first;
7476
7477 if (oldin && oldin->is_dir()) {
7478 assert(straydn);
7479 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
7480 }
7481 if (srci->is_dir())
7482 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
7483
7484 }
7485
7486
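// make the prepared rename live in the cache: unlink old linkages, pop the projected
// ones, finish any inode import from the srcdn auth, and fix up the subtree map for
// renamed directories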
7487 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7488 {
7489 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7490 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
7491
7492 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7493 CDentry::linkage_t *destdnl = destdn->get_linkage();
7494
7495 CInode *oldin = destdnl->get_inode();
7496
7497 // primary+remote link merge?
7498 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7499 (srcdnl->is_primary() || destdnl->is_primary()));
7500
7501 // target inode
7502 if (!linkmerge) {
7503 if (destdnl->is_primary()) {
7504 assert(straydn);
7505 dout(10) << "straydn is " << *straydn << dendl;
7506 destdn->get_dir()->unlink_inode(destdn, false);
7507
7508 straydn->pop_projected_linkage();
7509 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7510 assert(!straydn->is_projected()); // no other projected
7511
7512 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7513
7514 // nlink-- targeti
7515 if (destdn->is_auth()) {
7516 bool hadrealm = (oldin->snaprealm ? true : false);
7517 oldin->pop_and_dirty_projected_inode(mdr->ls);
7518 if (oldin->snaprealm && !hadrealm)
7519 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT);
7520 } else {
7521 // FIXME this snaprealm is not filled out correctly
7522 //oldin->open_snaprealm(); might be sufficient..
7523 }
7524 } else if (destdnl->is_remote()) {
7525 destdn->get_dir()->unlink_inode(destdn, false);
7526 if (oldin->is_auth())
7527 oldin->pop_and_dirty_projected_inode(mdr->ls);
7528 }
7529 }
7530
7531 // unlink src before we relink it at dest
7532 CInode *in = srcdnl->get_inode();
7533 assert(in);
7534
7535 bool srcdn_was_remote = srcdnl->is_remote();
7536 srcdn->get_dir()->unlink_inode(srcdn);
7537
7538 // dest
7539 if (srcdn_was_remote) {
7540 if (!linkmerge) {
7541 // destdn
7542 destdnl = destdn->pop_projected_linkage();
7543 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7544 assert(!destdn->is_projected()); // no other projected
7545
7546 destdn->link_remote(destdnl, in);
7547 if (destdn->is_auth())
7548 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
7549 // in
7550 if (in->is_auth())
7551 in->pop_and_dirty_projected_inode(mdr->ls);
7552 } else {
7553 dout(10) << "merging remote onto primary link" << dendl;
7554 oldin->pop_and_dirty_projected_inode(mdr->ls);
7555 }
7556 } else { // primary
7557 if (linkmerge) {
7558 dout(10) << "merging primary onto remote link" << dendl;
7559 destdn->get_dir()->unlink_inode(destdn, false);
7560 }
7561 destdnl = destdn->pop_projected_linkage();
7562 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7563 assert(!destdn->is_projected()); // no other projected
7564
7565 // srcdn inode import?
7566 if (!srcdn->is_auth() && destdn->is_auth()) {
7567 assert(mdr->more()->inode_import.length() > 0);
7568
7569 map<client_t,Capability::Import> imported_caps;
7570
7571 // finish cap imports
7572 finish_force_open_sessions(mdr->more()->imported_session_map);
7573 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
7574 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
7575 mdr->more()->srcdn_auth_mds, true,
7576 mdr->more()->imported_session_map,
7577 mdr->more()->cap_imports[destdnl->get_inode()],
7578 imported_caps);
7579 }
7580
7581 mdr->more()->inode_import.clear();
7582 ::encode(imported_caps, mdr->more()->inode_import);
7583
7584 /* hack: add an auth pin for each xlock we hold. These were
7585 * remote xlocks previously but now they're local and
7586 * we're going to try and unpin when we xlock_finish. */
7587 for (set<SimpleLock *>::iterator i = mdr->xlocks.begin();
7588 i != mdr->xlocks.end();
7589 ++i)
7590 if ((*i)->get_parent() == destdnl->get_inode() &&
7591 !(*i)->is_locallock())
7592 mds->locker->xlock_import(*i);
7593
7594 // hack: fix auth bit
7595 in->state_set(CInode::STATE_AUTH);
7596
7597 mdr->clear_ambiguous_auth();
7598 }
7599
7600 if (destdn->is_auth()) {
7601 in->pop_and_dirty_projected_inode(mdr->ls);
7602
7603 } else {
7604 // FIXME: fix up snaprealm!
7605 }
7606 }
7607
7608 // src
7609 if (srcdn->is_auth())
7610 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
7611 srcdn->pop_projected_linkage();
7612 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
7613 assert(!srcdn->is_projected()); // no other projected
7614
7615 // apply remaining projected inodes (nested)
7616 mdr->apply();
7617
7618 // update subtree map?
7619 if (destdnl->is_primary() && in->is_dir()) {
7620 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7621
7622 if (destdn->is_auth())
7623 mdcache->migrator->adjust_export_after_rename(in, srcdn->get_dir());
7624 }
7625
7626 if (straydn && oldin->is_dir())
7627 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
7628
7629 // removing a new dn?
7630 if (srcdn->is_auth())
7631 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
7632 }
7633
7634
7635
7636 // ------------
7637 // SLAVE
7638
7639 class C_MDS_SlaveRenamePrep : public ServerLogContext {
7640 CDentry *srcdn, *destdn, *straydn;
7641 public:
7642 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7643 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
7644 void finish(int r) override {
7645 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
7646 }
7647 };
7648
7649 class C_MDS_SlaveRenameCommit : public ServerContext {
7650 MDRequestRef mdr;
7651 CDentry *srcdn, *destdn, *straydn;
7652 public:
7653 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7654 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
7655 void finish(int r) override {
7656 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
7657 }
7658 };
7659
7660 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
7661 MDRequestRef mdr;
7662 public:
7663 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7664 ServerContext(s), mdr(r) {}
7665 void finish(int r) override {
7666 server->_slave_rename_sessions_flushed(mdr);
7667 }
7668 };
7669
7670 /* This function DOES put the mdr->slave_request before returning */
7671 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
7672 {
7673 dout(10) << "handle_slave_rename_prep " << *mdr
7674 << " " << mdr->slave_request->srcdnpath
7675 << " to " << mdr->slave_request->destdnpath
7676 << dendl;
7677
7678 if (mdr->slave_request->is_interrupted()) {
7679 dout(10) << " slave request interrupted, sending noop reply" << dendl;
7680 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7681 reply->mark_interrupted();
7682 mds->send_message_mds(reply, mdr->slave_to_mds);
7683 mdr->reset_slave_request();
7684 return;
7685 }
7686
7687 // discover destdn
7688 filepath destpath(mdr->slave_request->destdnpath);
7689 dout(10) << " dest " << destpath << dendl;
7690 vector<CDentry*> trace;
7691 int r = mdcache->path_traverse(mdr, NULL, NULL, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
7692 if (r > 0) return;
7693 if (r == -ESTALE) {
7694 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7695 mdr->slave_to_mds);
7696 return;
7697 }
7698 assert(r == 0); // we shouldn't get an error here!
7699
7700 CDentry *destdn = trace.back();
7701 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7702 dout(10) << " destdn " << *destdn << dendl;
7703 mdr->pin(destdn);
7704
7705 // discover srcdn
7706 filepath srcpath(mdr->slave_request->srcdnpath);
7707 dout(10) << " src " << srcpath << dendl;
7708 CInode *srci = nullptr;
7709 r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
7710 if (r > 0) return;
7711 assert(r == 0);
7712
7713 // srcpath must not point to a null dentry
7714 assert(srci != nullptr);
7715
7716 CDentry *srcdn = trace.back();
7717 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7718 dout(10) << " srcdn " << *srcdn << dendl;
7719 mdr->pin(srcdn);
7720 mdr->pin(srci);
7721
7722 // stray?
7723 bool linkmerge = (srcdnl->get_inode() == destdnl->get_inode() &&
7724 (srcdnl->is_primary() || destdnl->is_primary()));
7725 CDentry *straydn = mdr->straydn;
7726 if (destdnl->is_primary() && !linkmerge)
7727 assert(straydn);
7728
7729 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7730 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
7731
7732 // set up commit waiter (early, to clean up any freezing etc we do)
7733 if (!mdr->more()->slave_commit)
7734 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
7735
7736 // am i srcdn auth?
7737 if (srcdn->is_auth()) {
7738 set<mds_rank_t> srcdnrep;
7739 srcdn->list_replicas(srcdnrep);
7740
7741 bool reply_witness = false;
7742 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
7743 // freeze?
7744 // we need this to
7745 // - avoid conflicting lock state changes
7746 // - avoid concurrent updates to the inode
7747 // (this could also be accomplished with the versionlock)
7748 int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock
7749 allowance += srcdnl->get_inode()->is_dir(); // for the snap lock
7750 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
7751 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
7752
7753 // unfreeze auth pin after freezing the inode to avoid queueing waiters
7754 if (srcdnl->get_inode()->is_frozen_auth_pin())
7755 mdr->unfreeze_auth_pin();
7756
7757 if (!frozen_inode) {
7758 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
7759 return;
7760 }
7761
7762 /*
7763 * set ambiguous auth for srci
7764 * NOTE: we don't worry about ambiguous cache expire as we do
7765 * with subtree migrations because all slaves will pin
7766 * srcdn->get_inode() for duration of this rename.
7767 */
7768 mdr->set_ambiguous_auth(srcdnl->get_inode());
7769
7770 // just mark the source inode as ambiguous auth if more than two MDS are involved.
7771 // the master will send another OP_RENAMEPREP slave request later.
7772 if (mdr->slave_request->witnesses.size() > 1) {
7773 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
7774 reply_witness = true;
7775 }
7776
7777 // make sure bystanders have received all lock related messages
7778 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7779 if (*p == mdr->slave_to_mds ||
7780 (mds->is_cluster_degraded() &&
7781 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
7782 continue;
7783 MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7784 MMDSSlaveRequest::OP_RENAMENOTIFY);
7785 mds->send_message_mds(notify, *p);
7786 mdr->more()->waiting_on_slave.insert(*p);
7787 }
7788
7789 // make sure clients have received all cap related messages
7790 set<client_t> export_client_set;
7791 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
7792
7793 MDSGatherBuilder gather(g_ceph_context);
7794 flush_client_sessions(export_client_set, gather);
7795 if (gather.has_subs()) {
7796 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
7797 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
7798 gather.activate();
7799 }
7800 }
7801
7802 // is witness list sufficient?
7803 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
7804 if (*p == mdr->slave_to_mds ||
7805 mdr->slave_request->witnesses.count(*p)) continue;
7806 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
7807 reply_witness = true;
7808 break;
7809 }
7810
7811 if (reply_witness) {
7812 assert(!srcdnrep.empty());
7813 MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
7814 MMDSSlaveRequest::OP_RENAMEPREPACK);
7815 reply->witnesses.swap(srcdnrep);
7816 mds->send_message_mds(reply, mdr->slave_to_mds);
7817 mdr->reset_slave_request();
7818 return;
7819 }
7820 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
7821 if (!mdr->more()->waiting_on_slave.empty()) {
7822 dout(10) << " still waiting for rename notify acks from "
7823 << mdr->more()->waiting_on_slave << dendl;
7824 return;
7825 }
7826 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
7827 // set ambiguous auth for srci on witnesses
7828 mdr->set_ambiguous_auth(srcdnl->get_inode());
7829 }
7830
7831 // encode everything we'd need to roll this back... basically, just the original state.
7832 rename_rollback rollback;
7833
7834 rollback.reqid = mdr->reqid;
7835
7836 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
7837 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7838 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
7839 rollback.orig_src.dname = std::string(srcdn->get_name());
7840 if (srcdnl->is_primary())
7841 rollback.orig_src.ino = srcdnl->get_inode()->ino();
7842 else {
7843 assert(srcdnl->is_remote());
7844 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
7845 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
7846 }
7847
7848 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
7849 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
7850 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
7851 rollback.orig_dest.dname = std::string(destdn->get_name());
7852 if (destdnl->is_primary())
7853 rollback.orig_dest.ino = destdnl->get_inode()->ino();
7854 else if (destdnl->is_remote()) {
7855 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
7856 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
7857 }
7858
7859 if (straydn) {
7860 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
7861 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
7862 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
7863 rollback.stray.dname = std::string(straydn->get_name());
7864 }
7865 ::encode(rollback, mdr->more()->rollback_bl);
7866 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7867
7868 // journal.
7869 mdr->ls = mdlog->get_current_segment();
7870 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
7871 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
7872 mdlog->start_entry(le);
7873 le->rollback = mdr->more()->rollback_bl;
7874
7875 bufferlist blah; // inode import data... obviously not used if we're the slave
7876 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
7877
7878 if (le->commit.empty()) {
7879 dout(10) << " empty metablob, skipping journal" << dendl;
7880 mdlog->cancel_entry(le);
7881 mdr->ls = NULL;
7882 _logged_slave_rename(mdr, srcdn, destdn, straydn);
7883 } else {
7884 mdr->more()->slave_update_journaled = true;
7885 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
7886 mdr, __func__);
7887 mdlog->flush();
7888 }
7889 }
7890
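// runs once the slave's prepare is durable (or journaling was skipped): export srci's
// inode state if we are srcdn auth for a primary link, apply the rename, then ack the
// master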
7891 void Server::_logged_slave_rename(MDRequestRef& mdr,
7892 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7893 {
7894 dout(10) << "_logged_slave_rename " << *mdr << dendl;
7895
7896 // prepare ack
7897 MMDSSlaveRequest *reply = NULL;
7898 if (!mdr->aborted) {
7899 reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7900 if (!mdr->more()->slave_update_journaled)
7901 reply->mark_not_journaled();
7902 }
7903
7904 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7905 CDentry::linkage_t *destdnl = NULL;
7906 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
7907
7908 // export srci?
7909 if (srcdn->is_auth() && srcdnl->is_primary()) {
7910 // set export bounds for CInode::encode_export()
7911 list<CDir*> bounds;
7912 if (srcdnl->get_inode()->is_dir()) {
7913 srcdnl->get_inode()->get_dirfrags(bounds);
7914 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7915 (*p)->state_set(CDir::STATE_EXPORTBOUND);
7916 }
7917
7918 map<client_t,entity_inst_t> exported_client_map;
7919 bufferlist inodebl;
7920 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
7921 exported_client_map);
7922
7923 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
7924 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
7925
7926 if (reply) {
7927 ::encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
7928 reply->inode_export.claim_append(inodebl);
7929 reply->inode_export_v = srcdnl->get_inode()->inode.version;
7930 }
7931
7932 // remove mdr auth pin
7933 mdr->auth_unpin(srcdnl->get_inode());
7934 mdr->more()->is_inode_exporter = true;
7935
7936 if (srcdnl->get_inode()->is_dirty())
7937 srcdnl->get_inode()->mark_clean();
7938
7939 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
7940 }
7941
7942 // apply
7943 _rename_apply(mdr, srcdn, destdn, straydn);
7944
7945 destdnl = destdn->get_linkage();
7946
7947 // bump popularity
7948 utime_t now = ceph_clock_now();
7949 mds->balancer->hit_dir(now, srcdn->get_dir(), META_POP_IWR);
7950 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
7951 mds->balancer->hit_inode(now, destdnl->get_inode(), META_POP_IWR);
7952
7953 // done.
7954 mdr->reset_slave_request();
7955 mdr->straydn = 0;
7956
7957 if (reply) {
7958 mds->send_message_mds(reply, mdr->slave_to_mds);
7959 } else {
7960 assert(mdr->aborted);
7961 dout(10) << " abort flag set, finishing" << dendl;
7962 mdcache->request_finish(mdr);
7963 }
7964 }
7965
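// the master has resolved the rename: on success (r == 0) finish the inode export and
// journal a commit if the prepare was journaled; on failure roll back to the state
// captured in rollback_bl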
7966 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
7967 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7968 {
7969 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
7970
7971 CDentry::linkage_t *destdnl = destdn->get_linkage();
7972
7973 list<MDSInternalContextBase*> finished;
7974 if (r == 0) {
7975 // unfreeze+singleauth inode
7976 // hmm, do i really need to delay this?
7977 if (mdr->more()->is_inode_exporter) {
7978
7979 CInode *in = destdnl->get_inode();
7980
7981 // drop our pins
7982 // we exported, clear out any xlocks that we moved to another MDS
7983 set<SimpleLock*>::iterator i = mdr->xlocks.begin();
7984 while (i != mdr->xlocks.end()) {
7985 SimpleLock *lock = *i++;
7986
7987 // we only care about xlocks on the exported inode
7988 if (lock->get_parent() == in &&
7989 !lock->is_locallock())
7990 mds->locker->xlock_export(lock, mdr.get());
7991 }
7992
7993 map<client_t,Capability::Import> peer_imported;
7994 bufferlist::iterator bp = mdr->more()->inode_import.begin();
7995 ::decode(peer_imported, bp);
7996
7997 dout(10) << " finishing inode export on " << *destdnl->get_inode() << dendl;
7998 mdcache->migrator->finish_export_inode(destdnl->get_inode(), ceph_clock_now(),
7999 mdr->slave_to_mds, peer_imported, finished);
8000 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
8001
8002 // unfreeze
8003 assert(destdnl->get_inode()->is_frozen_inode());
8004 destdnl->get_inode()->unfreeze_inode(finished);
8005 }
8006
8007 // singleauth
8008 if (mdr->more()->is_ambiguous_auth) {
8009 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8010 mdr->more()->is_ambiguous_auth = false;
8011 }
8012
8013 if (straydn && mdr->more()->slave_update_journaled) {
8014 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8015 if (strayin && !strayin->snaprealm)
8016 mdcache->clear_dirty_bits_for_stray(strayin);
8017 }
8018
8019 mds->queue_waiters(finished);
8020 mdr->cleanup();
8021
8022 if (mdr->more()->slave_update_journaled) {
8023 // write a commit to the journal
8024 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
8025 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
8026 ESlaveUpdate::RENAME);
8027 mdlog->start_entry(le);
8028 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
8029 mdlog->flush();
8030 } else {
8031 _committed_slave(mdr);
8032 }
8033 } else {
8034
8035 // abort
8036 // rollback_bl may be empty if we froze the inode but had to provide an expanded
8037 // witness list to the master, and the master failed before we tried prep again.
8038 if (mdr->more()->rollback_bl.length()) {
8039 if (mdr->more()->is_inode_exporter) {
8040 dout(10) << " reversing inode export of " << *destdnl->get_inode() << dendl;
8041 destdnl->get_inode()->abort_export();
8042 }
8043 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
8044 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
8045 // rollback but preserve the slave request
8046 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
8047 mdr->more()->rollback_bl.clear();
8048 } else
8049 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
8050 } else {
8051 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
8052 // singleauth
8053 if (mdr->more()->is_ambiguous_auth) {
8054 if (srcdn->is_auth())
8055 mdr->more()->rename_inode->unfreeze_inode(finished);
8056
8057 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8058 mdr->more()->is_ambiguous_auth = false;
8059 }
8060 mds->queue_waiters(finished);
8061 mdcache->request_finish(mdr);
8062 }
8063 }
8064 }
8065
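// undo a rename's effect on one dirfrag: re-apply the link/unlink delta to
// fragstat/rstat and restore the saved mtime/rctime if this rename was the last thing
// to touch them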
8066 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
8067 bool isdir, int linkunlink, nest_info_t &rstat)
8068 {
8069 fnode_t *pf;
8070 pf = dir->project_fnode();
8071 mut->add_projected_fnode(dir);
8072 pf->version = dir->pre_dirty();
8073
8074 if (isdir) {
8075 pf->fragstat.nsubdirs += linkunlink;
8076 } else {
8077 pf->fragstat.nfiles += linkunlink;
8078 }
8079 if (r.ino) {
8080 pf->rstat.rbytes += linkunlink * rstat.rbytes;
8081 pf->rstat.rfiles += linkunlink * rstat.rfiles;
8082 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
8083 pf->rstat.rsnaprealms += linkunlink * rstat.rsnaprealms;
8084 }
8085 if (pf->fragstat.mtime == ctime) {
8086 pf->fragstat.mtime = r.dirfrag_old_mtime;
8087 if (pf->rstat.rctime == ctime)
8088 pf->rstat.rctime = r.dirfrag_old_rctime;
8089 }
8090 mut->add_updated_lock(&dir->get_inode()->filelock);
8091 mut->add_updated_lock(&dir->get_inode()->nestlock);
8092 }
8093
8094 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
8095 MutationRef mut;
8096 CDentry *srcdn;
8097 version_t srcdnpv;
8098 CDentry *destdn;
8099 CDentry *straydn;
8100 bool finish_mdr;
8101 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
8102 CDentry *sd, version_t pv, CDentry *dd,
8103 CDentry *st, bool f) :
8104 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
8105 straydn(st), finish_mdr(f) {}
8106 void finish(int r) override {
8107 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
8108 destdn, straydn, finish_mdr);
8109 }
8110 };
8111
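// rebuild the pre-rename state from the rollback blob: look up the original dirfrags
// and dentries, re-project the old linkages, restore ctimes and dir stats, then journal
// an OP_ROLLBACK slave event if the prepare was journaled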
8112 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
8113 bool finish_mdr)
8114 {
8115 rename_rollback rollback;
8116 bufferlist::iterator p = rbl.begin();
8117 ::decode(rollback, p);
8118
8119 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
8120 // need to finish this update before sending resolve to claim the subtree
8121 mdcache->add_rollback(rollback.reqid, master);
8122
8123 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
8124 mut->ls = mds->mdlog->get_current_segment();
8125
8126 CDentry *srcdn = NULL;
8127 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
8128 if (!srcdir)
8129 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
8130 if (srcdir) {
8131 dout(10) << " srcdir " << *srcdir << dendl;
8132 srcdn = srcdir->lookup(rollback.orig_src.dname);
8133 if (srcdn) {
8134 dout(10) << " srcdn " << *srcdn << dendl;
8135 assert(srcdn->get_linkage()->is_null());
8136 } else
8137 dout(10) << " srcdn not found" << dendl;
8138 } else
8139 dout(10) << " srcdir not found" << dendl;
8140
8141 CDentry *destdn = NULL;
8142 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
8143 if (!destdir)
8144 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
8145 if (destdir) {
8146 dout(10) << " destdir " << *destdir << dendl;
8147 destdn = destdir->lookup(rollback.orig_dest.dname);
8148 if (destdn)
8149 dout(10) << " destdn " << *destdn << dendl;
8150 else
8151 dout(10) << " destdn not found" << dendl;
8152 } else
8153 dout(10) << " destdir not found" << dendl;
8154
8155 CInode *in = NULL;
8156 if (rollback.orig_src.ino) {
8157 in = mdcache->get_inode(rollback.orig_src.ino);
8158 if (in && in->is_dir())
8159 assert(srcdn && destdn);
8160 } else
8161 in = mdcache->get_inode(rollback.orig_src.remote_ino);
8162
8163 CDir *straydir = NULL;
8164 CDentry *straydn = NULL;
8165 if (rollback.stray.dirfrag.ino) {
8166 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
8167 if (straydir) {
8168 dout(10) << "straydir " << *straydir << dendl;
8169 straydn = straydir->lookup(rollback.stray.dname);
8170 if (straydn) {
8171 dout(10) << " straydn " << *straydn << dendl;
8172 assert(straydn->get_linkage()->is_primary());
8173 } else
8174 dout(10) << " straydn not found" << dendl;
8175 } else
8176 dout(10) << "straydir not found" << dendl;
8177 }
8178
8179 CInode *target = NULL;
8180 if (rollback.orig_dest.ino) {
8181 target = mdcache->get_inode(rollback.orig_dest.ino);
8182 if (target)
8183 assert(destdn && straydn);
8184 } else if (rollback.orig_dest.remote_ino)
8185 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
8186
8187 // can't use is_auth() in the resolve stage
8188 mds_rank_t whoami = mds->get_nodeid();
8189 // slave
8190 assert(!destdn || destdn->authority().first != whoami);
8191 assert(!straydn || straydn->authority().first != whoami);
8192
8193 bool force_journal_src = false;
8194 bool force_journal_dest = false;
8195 if (in && in->is_dir() && srcdn->authority().first != whoami)
8196 force_journal_src = _need_force_journal(in, false);
8197 if (in && target && target->is_dir())
8198 force_journal_dest = _need_force_journal(in, true);
8199
8200 version_t srcdnpv = 0;
8201 // repair src
8202 if (srcdn) {
8203 if (srcdn->authority().first == whoami)
8204 srcdnpv = srcdn->pre_dirty();
8205 if (rollback.orig_src.ino) {
8206 assert(in);
8207 srcdn->push_projected_linkage(in);
8208 } else
8209 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
8210 rollback.orig_src.remote_d_type);
8211 }
8212
8213 CInode::mempool_inode *pip = 0;
8214 if (in) {
8215 if (in->authority().first == whoami) {
8216 auto &pi = in->project_inode();
8217 mut->add_projected_inode(in);
8218 pi.inode.version = in->pre_dirty();
8219 pip = &pi.inode;
8220 } else
8221 pip = in->get_projected_inode();
8222 if (pip->ctime == rollback.ctime)
8223 pip->ctime = rollback.orig_src.old_ctime;
8224 }
8225
8226 if (srcdn && srcdn->authority().first == whoami) {
8227 nest_info_t blah;
8228 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
8229 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
8230 }
8231
8232 // repair dest
8233 if (destdn) {
8234 if (rollback.orig_dest.ino && target) {
8235 destdn->push_projected_linkage(target);
8236 } else if (rollback.orig_dest.remote_ino) {
8237 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
8238 rollback.orig_dest.remote_d_type);
8239 } else {
8240 // the dentry will be trimmed soon, it's ok to have wrong linkage
8241 if (rollback.orig_dest.ino)
8242 assert(mds->is_resolve());
8243 destdn->push_projected_linkage();
8244 }
8245 }
8246
8247 if (straydn)
8248 straydn->push_projected_linkage();
8249
8250 if (target) {
8251 CInode::mempool_inode *ti = NULL;
8252 if (target->authority().first == whoami) {
8253 auto &pi = target->project_inode();
8254 mut->add_projected_inode(target);
8255 pi.inode.version = target->pre_dirty();
8256 ti = &pi.inode;
8257 } else
8258 ti = target->get_projected_inode();
8259 if (ti->ctime == rollback.ctime)
8260 ti->ctime = rollback.orig_dest.old_ctime;
8261 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
8262 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
8263 assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
8264 else
8265 assert(rollback.orig_dest.remote_ino &&
8266 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
8267 } else
8268 ti->nlink++;
8269 }
8270
8271 if (srcdn)
8272 dout(0) << " srcdn back to " << *srcdn << dendl;
8273 if (in)
8274 dout(0) << " srci back to " << *in << dendl;
8275 if (destdn)
8276 dout(0) << " destdn back to " << *destdn << dendl;
8277 if (target)
8278 dout(0) << " desti back to " << *target << dendl;
8279
8280 // journal it
8281 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
8282 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
8283 mdlog->start_entry(le);
8284
8285 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
8286 le->commit.add_dir_context(srcdir);
8287 if (rollback.orig_src.ino)
8288 le->commit.add_primary_dentry(srcdn, 0, true);
8289 else
8290 le->commit.add_remote_dentry(srcdn, true);
8291 }
8292
8293 if (!rollback.orig_src.ino && // remote linkage
8294 in && in->authority().first == whoami) {
8295 le->commit.add_dir_context(in->get_projected_parent_dir());
8296 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
8297 }
8298
8299 if (force_journal_dest) {
8300 assert(rollback.orig_dest.ino);
8301 le->commit.add_dir_context(destdir);
8302 le->commit.add_primary_dentry(destdn, 0, true);
8303 }
8304
8305 // slave: no need to journal straydn
8306
8307 if (target && target != in && target->authority().first == whoami) {
8308 assert(rollback.orig_dest.remote_ino);
8309 le->commit.add_dir_context(target->get_projected_parent_dir());
8310 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
8311 }
8312
8313 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
8314 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
8315 le->commit.renamed_dirino = in->ino();
8316 if (srcdn->authority().first == whoami) {
8317 list<CDir*> ls;
8318 in->get_dirfrags(ls);
8319 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8320 CDir *dir = *p;
8321 if (!dir->is_auth())
8322 le->commit.renamed_dir_frags.push_back(dir->get_frag());
8323 }
8324 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
8325 }
8326 } else if (force_journal_dest) {
8327 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
8328 le->commit.renamed_dirino = target->ino();
8329 }
8330
8331 if (target && target->is_dir()) {
8332 assert(destdn);
8333 mdcache->project_subtree_rename(target, straydir, destdir);
8334 }
8335
8336 if (in && in->is_dir()) {
8337 assert(srcdn);
8338 mdcache->project_subtree_rename(in, destdir, srcdir);
8339 }
8340
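// if this slave never journaled its prepare, nothing reached the journal,
// so the rollback need not be journaled either: drop the empty event and
// finish synchronously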
8341 if (mdr && !mdr->more()->slave_update_journaled) {
8342 assert(le->commit.empty());
8343 mdlog->cancel_entry(le);
8344 mut->ls = NULL;
8345 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, finish_mdr);
8346 } else {
8347 assert(!le->commit.empty());
8348 if (mdr)
8349 mdr->more()->slave_update_journaled = false;
8350 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr, srcdn, srcdnpv,
8351 destdn, straydn, finish_mdr);
8352 submit_mdlog_entry(le, fin, mdr, __func__);
8353 mdlog->flush();
8354 }
8355 }
8356
8357 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
8358 version_t srcdnpv, CDentry *destdn,
8359 CDentry *straydn, bool finish_mdr)
8360 {
8361 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
8362
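// undo the projected linkages: unlink the stray and dest dentries before
// popping, then restore srcdn and (if auth) mark it dirty at its
// pre-dirtied version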
8363 if (straydn) {
8364 straydn->get_dir()->unlink_inode(straydn);
8365 straydn->pop_projected_linkage();
8366 }
8367 if (destdn) {
8368 destdn->get_dir()->unlink_inode(destdn);
8369 destdn->pop_projected_linkage();
8370 }
8371 if (srcdn) {
8372 srcdn->pop_projected_linkage();
8373 if (srcdn->authority().first == mds->get_nodeid())
8374 srcdn->mark_dirty(srcdnpv, mut->ls);
8375 }
8376
8377 mut->apply();
8378
8379 if (srcdn && srcdn->get_linkage()->is_primary()) {
8380 CInode *in = srcdn->get_linkage()->get_inode();
8381 if (srcdn->authority().first == mds->get_nodeid())
8382 in->state_set(CInode::STATE_AUTH);
8383 // update subtree map?
8384 if (in && in->is_dir()) {
8385 assert(destdn);
8386 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
8387 }
8388 }
8389
8390 if (destdn) {
8391 CInode *oldin = destdn->get_linkage()->get_inode();
8392 // update subtree map?
8393 if (oldin && oldin->is_dir()) {
8394 assert(straydn);
8395 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
8396 }
8397 }
8398
8399 if (mds->is_resolve()) {
8400 CDir *root = NULL;
8401 if (straydn)
8402 root = mdcache->get_subtree_root(straydn->get_dir());
8403 else if (destdn)
8404 root = mdcache->get_subtree_root(destdn->get_dir());
8405 if (root)
8406 mdcache->try_trim_non_auth_subtree(root);
8407 }
8408
8409 if (mdr) {
8410 list<MDSInternalContextBase*> finished;
8411 if (mdr->more()->is_ambiguous_auth) {
8412 if (srcdn->is_auth())
8413 mdr->more()->rename_inode->unfreeze_inode(finished);
8414
8415 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8416 mdr->more()->is_ambiguous_auth = false;
8417 }
8418 mds->queue_waiters(finished);
8419 if (finish_mdr || mdr->aborted)
8420 mdcache->request_finish(mdr);
8421 else
8422 mdr->more()->slave_rolling_back = false;
8423 }
8424
8425 mdcache->finish_rollback(mut->reqid);
8426
8427 mut->cleanup();
8428 }
8429
8430 /* This function DOES call put() on the passed message before returning */
8431 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8432 {
8433 dout(10) << "handle_slave_rename_prep_ack " << *mdr
8434 << " witnessed by " << ack->get_source()
8435 << " " << *ack << dendl;
8436 mds_rank_t from = mds_rank_t(ack->get_source().num());
8437
8438 // note slave
8439 mdr->more()->slaves.insert(from);
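// if the srcdn auth froze and authpinned the renamed inode on our behalf,
// its auth remains ambiguous until the rename commits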
8440 if (mdr->more()->srcdn_auth_mds == from &&
8441 mdr->more()->is_remote_frozen_authpin &&
8442 !mdr->more()->is_ambiguous_auth) {
8443 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
8444 }
8445
8446 // witnessed? or add extra witnesses?
8447 assert(mdr->more()->witnessed.count(from) == 0);
8448 if (ack->is_interrupted()) {
8449 dout(10) << " slave request interrupted, noop" << dendl;
8450 } else if (ack->witnesses.empty()) {
8451 mdr->more()->witnessed.insert(from);
8452 if (!ack->is_not_journaled())
8453 mdr->more()->has_journaled_slaves = true;
8454 } else {
8455 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
8456 mdr->more()->extra_witnesses.swap(ack->witnesses);
8457 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
8458 }
8459
8460 // srci import?
8461 if (ack->inode_export.length()) {
8462 dout(10) << " got srci import" << dendl;
8463 mdr->more()->inode_import.claim(ack->inode_export);
8464 mdr->more()->inode_import_v = ack->inode_export_v;
8465 }
8466
8467 // remove from waiting list
8468 assert(mdr->more()->waiting_on_slave.count(from));
8469 mdr->more()->waiting_on_slave.erase(from);
8470
8471 if (mdr->more()->waiting_on_slave.empty())
8472 dispatch_client_request(mdr); // go again!
8473 else
8474 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
8475 }
8476
8477 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
8478 {
8479 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
8480 << ack->get_source() << dendl;
8481 assert(mdr->is_slave());
8482 mds_rank_t from = mds_rank_t(ack->get_source().num());
8483
8484 if (mdr->more()->waiting_on_slave.count(from)) {
8485 mdr->more()->waiting_on_slave.erase(from);
8486
8487 if (mdr->more()->waiting_on_slave.empty()) {
8488 if (mdr->slave_request)
8489 dispatch_slave_request(mdr);
8490 } else
8491 dout(10) << " still waiting for rename notify acks from "
8492 << mdr->more()->waiting_on_slave << dendl;
8493 }
8494 }
8495
8496 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
8497 {
8498 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
8499
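// MDS_RANK_NONE serves as a sentinel entry in waiting_on_slave for the
// pending session flush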
8500 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
8501 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
8502
8503 if (mdr->more()->waiting_on_slave.empty()) {
8504 if (mdr->slave_request)
8505 dispatch_slave_request(mdr);
8506 } else
8507 dout(10) << " still waiting for rename notify acks from "
8508 << mdr->more()->waiting_on_slave << dendl;
8509 }
8510 }
8511
8512 // snaps
8513 /* This function takes responsibility for the passed mdr */
8514 void Server::handle_client_lssnap(MDRequestRef& mdr)
8515 {
8516 MClientRequest *req = mdr->client_request;
8517
8518 // traverse to path
8519 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8520 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8521 respond_to_request(mdr, -ESTALE);
8522 return;
8523 }
8524 if (!diri->is_auth()) {
8525 mdcache->request_forward(mdr, diri->authority().first);
8526 return;
8527 }
8528 if (!diri->is_dir()) {
8529 respond_to_request(mdr, -ENOTDIR);
8530 return;
8531 }
8532 dout(10) << "lssnap on " << *diri << dendl;
8533
8534 // lock snap
8535 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8536 mds->locker->include_snap_rdlocks(rdlocks, diri);
8537 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8538 return;
8539
8540 if (!check_access(mdr, diri, MAY_READ))
8541 return;
8542
8543 SnapRealm *realm = diri->find_snaprealm();
8544 map<snapid_t,SnapInfo*> infomap;
8545 realm->get_snap_info(infomap, diri->get_oldest_snap());
8546
8547 unsigned max_entries = req->head.args.readdir.max_entries;
8548 if (!max_entries)
8549 max_entries = infomap.size();
8550 int max_bytes = req->head.args.readdir.max_bytes;
8551 if (!max_bytes)
8552 // make sure at least one item can be encoded
8553 max_bytes = (512 << 10) + g_conf->mds_max_xattr_pairs_size;
8554
8555 __u64 last_snapid = 0;
8556 string offset_str = req->get_path2();
8557 if (!offset_str.empty())
8558 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
8559
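// assemble a readdir-style reply; the payload layout is, roughly:
//   dirstat | __u32 num | __u16 flags | num x (name, lease, inodestat)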
8560 bufferlist dirbl;
8561 encode_empty_dirstat(dirbl);
8562
8563 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
8564
8565 __u32 num = 0;
8566 bufferlist dnbl;
8567 map<snapid_t,SnapInfo*>::iterator p = infomap.upper_bound(last_snapid);
8568 for (; p != infomap.end() && num < max_entries; ++p) {
8569 dout(10) << p->first << " -> " << *p->second << dendl;
8570
8571 // use the plain name for this directory's own snaps, the long form otherwise
8572 string snap_name;
8573 if (p->second->ino == diri->ino())
8574 snap_name = std::string(p->second->name);
8575 else
8576 snap_name = std::string(p->second->get_long_name());
8577
8578 unsigned start_len = dnbl.length();
8579 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
8580 break;
8581
8582 ::encode(snap_name, dnbl);
8583 encode_infinite_lease(dnbl);
8584
8585 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
8586 if (r < 0) {
8587 bufferlist keep;
8588 keep.substr_of(dnbl, 0, start_len);
8589 dnbl.swap(keep);
8590 break;
8591 }
8592 ++num;
8593 }
8594
8595 ::encode(num, dirbl);
8596 __u16 flags = 0;
8597 if (p == infomap.end()) {
8598 flags = CEPH_READDIR_FRAG_END;
8599 if (last_snapid == 0)
8600 flags |= CEPH_READDIR_FRAG_COMPLETE;
8601 }
8602 ::encode(flags, dirbl);
8603 dirbl.claim_append(dnbl);
8604
8605 mdr->reply_extra_bl = dirbl;
8606 mdr->tracei = diri;
8607 respond_to_request(mdr, 0);
8608 }
8609
8610
8611 // MKSNAP
8612
8613 struct C_MDS_mksnap_finish : public ServerLogContext {
8614 CInode *diri;
8615 SnapInfo info;
8616 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
8617 ServerLogContext(s, r), diri(di), info(i) {}
8618 void finish(int r) override {
8619 server->_mksnap_finish(mdr, diri, info);
8620 }
8621 };
8622
8623 /* This function takes responsibility for the passed mdr */
8624 void Server::handle_client_mksnap(MDRequestRef& mdr)
8625 {
8626 if (!mds->mdsmap->allows_snaps()) {
8627 // snapshot creation is disabled until the 'allow snaps' flag is set in the MDSMap
8628 respond_to_request(mdr, -EPERM);
8629 return;
8630 }
8631
8632 MClientRequest *req = mdr->client_request;
8633 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8634 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8635 respond_to_request(mdr, -ESTALE);
8636 return;
8637 }
8638
8639 if (!diri->is_auth()) { // fw to auth?
8640 mdcache->request_forward(mdr, diri->authority().first);
8641 return;
8642 }
8643
8644 // dir only
8645 if (!diri->is_dir()) {
8646 respond_to_request(mdr, -ENOTDIR);
8647 return;
8648 }
8649 if (diri->is_system() && !diri->is_root()) {
8650 // no snaps in system dirs (root is ok)
8651 respond_to_request(mdr, -EPERM);
8652 return;
8653 }
8654
8655 boost::string_view snapname = req->get_filepath().last_dentry();
8656
8657 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8658 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8659 respond_to_request(mdr, -EPERM);
8660 return;
8661 }
8662
8663 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
8664
8665 // lock snap
8666 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8667
8668 mds->locker->include_snap_rdlocks(rdlocks, diri);
8669 rdlocks.erase(&diri->snaplock);
8670 xlocks.insert(&diri->snaplock);
8671
8672 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8673 return;
8674
8675 if (!check_access(mdr, diri, MAY_WRITE))
8676 return;
8677
8678 // make sure name is unique
8679 if (diri->snaprealm &&
8680 diri->snaprealm->exists(snapname)) {
8681 respond_to_request(mdr, -EEXIST);
8682 return;
8683 }
8684 if (snapname.length() == 0 ||
8685 snapname[0] == '_') {
8686 respond_to_request(mdr, -EINVAL);
8687 return;
8688 }
8689
8690 // allocate a snapid
8691 if (!mdr->more()->stid) {
8692 // prepare an stid
8693 mds->snapclient->prepare_create(diri->ino(), snapname,
8694 mdr->get_mds_stamp(),
8695 &mdr->more()->stid, &mdr->more()->snapidbl,
8696 new C_MDS_RetryRequest(mdcache, mdr));
8697 return;
8698 }
8699
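// second pass: the snap table prepare has completed, so decode the
// snapid that was allocated along with the stid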
8700 version_t stid = mdr->more()->stid;
8701 snapid_t snapid;
8702 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8703 ::decode(snapid, p);
8704 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
8705
8706 // journal
8707 SnapInfo info;
8708 info.ino = diri->ino();
8709 info.snapid = snapid;
8710 info.name = std::string(snapname);
8711 info.stamp = mdr->get_op_stamp();
8712
8713 auto &pi = diri->project_inode(false, true);
8714 pi.inode.ctime = info.stamp;
8715 if (info.stamp > pi.inode.rstat.rctime)
8716 pi.inode.rstat.rctime = info.stamp;
8717 pi.inode.version = diri->pre_dirty();
8718
8719 // project the snaprealm
8720 auto &newsnap = *pi.snapnode;
8721 newsnap.created = snapid;
8722 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
8723 if (!em.second)
8724 em.first->second = info;
8725 newsnap.seq = snapid;
8726 newsnap.last_created = snapid;
8727
8728 // journal the inode changes
8729 mdr->ls = mdlog->get_current_segment();
8730 EUpdate *le = new EUpdate(mdlog, "mksnap");
8731 mdlog->start_entry(le);
8732
8733 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8734 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8735 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8736 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8737
8738 // journal the snaprealm changes
8739 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
8740 mdr, __func__);
8741 mdlog->flush();
8742 }
8743
8744 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
8745 {
8746 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
8747
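// an existing realm merely gains a snap (CREATE); a brand-new realm
// splits inodes off the parent realm, so clients are sent a SPLIT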
8748 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
8749
8750 diri->pop_and_dirty_projected_inode(mdr->ls);
8751 mdr->apply();
8752
8753 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
8754
8755 // create snap
8756 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8757
8758 mdcache->do_realm_invalidate_and_update_notify(diri, op);
8759
8760 // yay
8761 mdr->in[0] = diri;
8762 mdr->snapid = info.snapid;
8763 mdr->tracei = diri;
8764 respond_to_request(mdr, 0);
8765 }
8766
8767
8768 // RMSNAP
8769
8770 struct C_MDS_rmsnap_finish : public ServerLogContext {
8771 CInode *diri;
8772 snapid_t snapid;
8773 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8774 ServerLogContext(s, r), diri(di), snapid(sn) {}
8775 void finish(int r) override {
8776 server->_rmsnap_finish(mdr, diri, snapid);
8777 }
8778 };
8779
8780 /* This function takes responsibility for the passed mdr */
8781 void Server::handle_client_rmsnap(MDRequestRef& mdr)
8782 {
8783 MClientRequest *req = mdr->client_request;
8784
8785 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8786 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8787 respond_to_request(mdr, -ESTALE);
8788 return;
8789 }
8790 if (!diri->is_auth()) { // fw to auth?
8791 mdcache->request_forward(mdr, diri->authority().first);
8792 return;
8793 }
8794 if (!diri->is_dir()) {
8795 respond_to_request(mdr, -ENOTDIR);
8796 return;
8797 }
8798
8799 boost::string_view snapname = req->get_filepath().last_dentry();
8800
8801 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8802 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
8803 respond_to_request(mdr, -EPERM);
8804 return;
8805 }
8806
8807 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
8808
8809 // does snap exist?
8810 if (snapname.length() == 0 || snapname[0] == '_') {
8811 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
8812 return;
8813 }
8814 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
8815 respond_to_request(mdr, -ENOENT);
8816 return;
8817 }
8818 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
8819 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
8820
8821 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8822 mds->locker->include_snap_rdlocks(rdlocks, diri);
8823 rdlocks.erase(&diri->snaplock);
8824 xlocks.insert(&diri->snaplock);
8825
8826 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8827 return;
8828
8829 if (!check_access(mdr, diri, MAY_WRITE))
8830 return;
8831
8832 // prepare
8833 if (!mdr->more()->stid) {
8834 mds->snapclient->prepare_destroy(diri->ino(), snapid,
8835 &mdr->more()->stid, &mdr->more()->snapidbl,
8836 new C_MDS_RetryRequest(mdcache, mdr));
8837 return;
8838 }
8839 version_t stid = mdr->more()->stid;
8840 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8841 snapid_t seq;
8842 ::decode(seq, p);
8843 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8844
8845 // journal
8846 auto &pi = diri->project_inode(false, true);
8847 pi.inode.version = diri->pre_dirty();
8848 pi.inode.ctime = mdr->get_op_stamp();
8849 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
8850 pi.inode.rstat.rctime = mdr->get_op_stamp();
8851
8852 mdr->ls = mdlog->get_current_segment();
8853 EUpdate *le = new EUpdate(mdlog, "rmsnap");
8854 mdlog->start_entry(le);
8855
8856 // project the snaprealm
8857 auto &newnode = *pi.snapnode;
8858 newnode.snaps.erase(snapid);
8859 newnode.seq = seq;
8860 newnode.last_destroyed = seq;
8861
8862 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
8863 le->metablob.add_table_transaction(TABLE_SNAP, stid);
8864 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
8865 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
8866
8867 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
8868 mdr, __func__);
8869 mdlog->flush();
8870 }
8871
8872 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
8873 {
8874 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
8875 snapid_t stid = mdr->more()->stid;
8876 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8877 snapid_t seq;
8878 ::decode(seq, p);
8879
8880 diri->pop_and_dirty_projected_inode(mdr->ls);
8881 mdr->apply();
8882
8883 mds->snapclient->commit(stid, mdr->ls);
8884
8885 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
8886
8887 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
8888
8889 // yay
8890 mdr->in[0] = diri;
8891 respond_to_request(mdr, 0);
8892
8893 // purge snapshot data
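// (only safe once all past parents are open, i.e. the realm's full snap
// set is known)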
8894 if (diri->snaprealm->have_past_parents_open())
8895 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
8896 }
8897
8898 struct C_MDS_renamesnap_finish : public ServerLogContext {
8899 CInode *diri;
8900 snapid_t snapid;
8901 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
8902 ServerLogContext(s, r), diri(di), snapid(sn) {}
8903 void finish(int r) override {
8904 server->_renamesnap_finish(mdr, diri, snapid);
8905 }
8906 };
8907
8908 /* This function takes responsibility for the passed mdr */
8909 void Server::handle_client_renamesnap(MDRequestRef& mdr)
8910 {
8911 MClientRequest *req = mdr->client_request;
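// both paths must name snapshots of the same directory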
8912 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
8913 respond_to_request(mdr, -EINVAL);
8914 return;
8915 }
8916
8917 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
8918 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
8919 respond_to_request(mdr, -ESTALE);
8920 return;
8921 }
8922
8923 if (!diri->is_auth()) { // fw to auth?
8924 mdcache->request_forward(mdr, diri->authority().first);
8925 return;
8926 }
8927
8928 if (!diri->is_dir()) { // dir only
8929 respond_to_request(mdr, -ENOTDIR);
8930 return;
8931 }
8932
8933 if (mdr->client_request->get_caller_uid() < g_conf->mds_snap_min_uid ||
8934 mdr->client_request->get_caller_uid() > g_conf->mds_snap_max_uid) {
8935 respond_to_request(mdr, -EPERM);
8936 return;
8937 }
8938
8939 boost::string_view dstname = req->get_filepath().last_dentry();
8940 boost::string_view srcname = req->get_filepath2().last_dentry();
8941 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
8942
8943 if (srcname.length() == 0 || srcname[0] == '_') {
8944 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
8945 return;
8946 }
8947 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
8948 respond_to_request(mdr, -ENOENT);
8949 return;
8950 }
8951 if (dstname.length() == 0 || dstname[0] == '_') {
8952 respond_to_request(mdr, -EINVAL);
8953 return;
8954 }
8955 if (diri->snaprealm->exists(dstname)) {
8956 respond_to_request(mdr, -EEXIST);
8957 return;
8958 }
8959
8960 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
8961 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
8962
8963 // lock snap
8964 set<SimpleLock*> rdlocks, wrlocks, xlocks;
8965
8966 mds->locker->include_snap_rdlocks(rdlocks, diri);
8967 rdlocks.erase(&diri->snaplock);
8968 xlocks.insert(&diri->snaplock);
8969
8970 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
8971 return;
8972
8973 if (!check_access(mdr, diri, MAY_WRITE))
8974 return;
8975
8976 // prepare
8977 if (!mdr->more()->stid) {
8978 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
8979 &mdr->more()->stid, &mdr->more()->snapidbl,
8980 new C_MDS_RetryRequest(mdcache, mdr));
8981 return;
8982 }
8983
8984 version_t stid = mdr->more()->stid;
8985 bufferlist::iterator p = mdr->more()->snapidbl.begin();
8986 snapid_t seq;
8987 ::decode(seq, p);
8988 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
8989
8990 // journal
8991 auto &pi = diri->project_inode(false, true);
8992 pi.inode.ctime = mdr->get_op_stamp();
8993 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
8994 pi.inode.rstat.rctime = mdr->get_op_stamp();
8995 pi.inode.version = diri->pre_dirty();
8996
8997 // project the snaprealm
8998 auto &newsnap = *pi.snapnode;
8999 auto it = newsnap.snaps.find(snapid);
9000 assert(it != newsnap.snaps.end());
9001 it->second.name = std::string(dstname);
9002
9003 // journal the inode changes
9004 mdr->ls = mdlog->get_current_segment();
9005 EUpdate *le = new EUpdate(mdlog, "renamesnap");
9006 mdlog->start_entry(le);
9007
9008 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9009 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9010 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9011 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9012
9013 // journal the snaprealm changes
9014 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
9015 mdr, __func__);
9016 mdlog->flush();
9017 }
9018
9019 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9020 {
9021 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
9022
9023 diri->pop_and_dirty_projected_inode(mdr->ls);
9024 mdr->apply();
9025
9026 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9027
9028 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9029
9030 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE, true);
9031
9032 // yay
9033 mdr->in[0] = diri;
9034 mdr->tracei = diri;
9035 mdr->snapid = snapid;
9036 respond_to_request(mdr, 0);
9037 }
9038
9039 /**
9040 * Return true if server is in state RECONNECT and this
9041 * client has not yet reconnected.
9042 */
9043 bool Server::waiting_for_reconnect(client_t c) const
9044 {
9045 return client_reconnect_gather.count(c) > 0;
9046 }
9047
9048 void Server::dump_reconnect_status(Formatter *f) const
9049 {
9050 f->open_object_section("reconnect_status");
9051 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
9052 f->close_section();
9053 }