// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>
#include <math.h>

#include <list>
#include <iostream>
#include <string_view>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
class ServerContext : public MDSContext {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};
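
// Note (added commentary, not in the original): these two wrappers exist so
// completions queued on the MDS finisher or the journal can recover the
// owning MDSRank, and so log contexts can stamp the request's event trail
// once the journal entry commits (see pre_finish() above). Illustrative use
// from inside Server, with `le` and `mdr` as hypothetical stand-ins:
//
//   mdlog->start_submit_entry(le, new ServerLogContext(this, mdr));
//   // on commit: pre_finish() marks the event, then finish() runs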

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}
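
// Illustrative sketch (added, not part of the upstream file): the counters
// registered above are driven from the request paths, roughly as in the
// hypothetical helper below -- bump the event counter when a request arrives,
// and feed the elapsed time into the matching *_latency average when the
// reply goes out.
//
//   void count_and_time(PerfCounters *logger, int event, int latency,
//                       utime_t start) {
//     if (logger) {
//       logger->inc(event);                              // e.g. l_mdss_handle_client_request
//       logger->tinc(latency, ceph_clock_now() - start); // e.g. l_mdss_req_getattr_latency
//     }
//   }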

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}

void Server::dispatch(const Message::const_ref &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(MClientReconnect::msgref_cast(m));
    return;
  }

  // active?
  // handle_slave_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = MClientRequest::msgref_cast(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
        }
      } else if (req->get_retry_attempt()) {
        // process completed requests in the clientreplay stage. A completed
        // request may have created a new file/directory; this guarantees the
        // MDS sends a reply to the client before another request modifies
        // that new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(MClientRequest::msgref_cast(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(MClientReclaim::msgref_cast(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(MMDSSlaveRequest::msgref_cast(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}
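
// Dispatch summary (added commentary): CLIENT_RECONNECT is handled in any
// state; a CLIENT_REQUEST that arrives before the rank is active is either
// queued for replay (replayed or already-completed ops), dropped (closed or
// missing session), or parked via wait_for_active(); once past that gate,
// messages are routed purely by type.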



// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}
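
// Invariant (added commentary): during a reclaim at most two sessions may
// advertise the same uuid -- the stale session being reclaimed and the new
// session doing the reclaiming, linked by Session::reclaiming_from. The loop
// above resolves such a pair to the reclaiming session; the asserts encode
// the "at most one link, in one direction" invariant.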

void Server::reclaim_session(Session *session, const MClientReclaim::const_ref &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = MClientReclaimReply::create(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-EPERM);
      mds->send_message_client(reply, session);
      // bail out here: without this return, the reclaim would proceed (and
      // the reply would be sent a second time) despite the EPERM above.
      return;
    }

    assert(!target->reclaiming_from);
    assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

void Server::finish_reclaim_session(Session *session, const MClientReclaimReply::ref &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new FunctionContext([this, session_id, reply](int r) {
        assert(mds->mds_lock.is_locked_by_me());
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blacklisted(target->info.inst.addr);
    });

    if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      std::stringstream ss;
      mds->evict_client(target->get_client().v, false, true, ss, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

void Server::handle_client_reclaim(const MClientReclaim::const_ref &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

void Server::handle_client_session(const MClientSession::const_ref &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
        auto m = MClientSession::create(CEPH_SESSION_REJECT);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blacklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blacklisted(addr);
          });

      if (blacklisted) {
        dout(10) << "rejecting blacklisted client " << addr << dendl;
        send_reject_message("blacklisted");
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << " " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        send_reject_message(ss.str());
        mds->clog->warn() << "client session lacks required features '"
                          << missing_features << "' denied (" << session->info.inst << ")";
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        stringstream ss;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
          ss << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          ss << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(ss.str());
          mds->clog->warn() << "client session with " << ss.str()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed())
        mds->sessionmap.add_session(session);

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new FunctionContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = MClientSession::create(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "mismatched push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}
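
// Ordering note for the open path above (added commentary): the session is
// journaled in OPENING state via ESession *before* any reply is sent;
// C_MDS_session_finish fires only once the entry is safe, _session_logged()
// flips the session to OPEN, and only then does the client see
// CEPH_SESSION_OPEN. A crash before the commit therefore never leaves a
// client believing it has a session the MDS never durably recorded.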


void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather->new_sub());
  mds->send_message_client(
    MClientSession::create(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    ceph_assert(session);
    flush_session(session, &gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}
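
// Illustrative caller of the flush machinery (a sketch under assumptions,
// not a real call site; `clients` and `onfinish` are hypothetical):
//
//   MDSGatherBuilder gather(g_ceph_context);
//   flush_client_sessions(clients, gather);
//   if (gather.has_subs()) {
//     gather.set_finisher(onfinish);   // runs after every FLUSHMSG_ACK
//     gather.activate();
//   } else {
//     onfinish->complete(0);           // nothing needed flushing
//   }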

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    ceph_assert(session->is_closing() || session->is_killing() ||
                session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    ceph_assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    ceph_assert(session->get_connection());
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = MClientSession::create(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable.  if there is a fault, we will get a
      // reset and clean it up.  if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(MClientSession::create(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        session->get_connection()->set_priv(NULL);
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blacklisted(p->second.addr)) {
            dout(10) << " ignoring blacklisted client " << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}
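
// Pairing note (added commentary): prepare_force_open_sessions() only
// *projects* the new session states and bumps each session's importing
// count; the caller is expected to journal the change and then invoke
// finish_force_open_sessions() below once it is safe, which commits the
// OPEN state and sends the CEPH_SESSION_OPEN replies.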

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);

        auto reply = MClientSession::create(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          reply->supported_features = supported_features;
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}


void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions.  last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  //  (caps go stale, leases die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  std::vector<Session*> to_evict;

  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      mds->locker->revoke_stale_caps(session);
      mds->locker->remove_stale_leases(session);
      mds->send_message_client(MClientSession::create(CEPH_SESSION_STALE, session->get_push_seq()), session);
      finish_flush_session(session, session->get_push_seq());
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 && mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "skipping client eviction because there is only one" << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session: to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}
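
// Timeout semantics above (added commentary): a client may publish a
// "timeout" entry in its session metadata. A value of 0 means "never
// auto-evict"; any other value overrides the mdsmap session_timeout for
// that session, and such sessions skip the STALE state and are evicted
// directly once the deadline passes.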

void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  std::list<client_t> to_evict;
  mds->locker->get_late_revoking_clients(&to_evict, cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    std::stringstream ss;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blacklist_on_evict,
                                     ss, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const ConfigProxy& conf,
                                const std::set<std::string>& changed) {
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
        return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
      });

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blacklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blacklist.count(inst_addr)) {
      victims.push_back(s);
      continue;
    }
    if (prenautilus) {
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blacklist.count(inst_addr)) {
        victims.push_back(s);
      }
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}
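
// Atomicity note (added commentary): the session close and the inotable
// release are journaled as a single ESession event, so on replay either
// both apply or neither does; _session_logged() consumes `both`/`piv`
// when the entry commits. Rough flow:
//
//   journal_close_session(session, Session::STATE_CLOSING, nullptr);
//     -> ESession(..., open=false, pv, both, piv)
//     -> C_MDS_session_finish -> _session_logged(..., open=false, ...)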

void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

void Server::handle_client_reconnect(const MClientReconnect::const_ref &m)
{
  dout(7) << "handle_client_reconnect " << m->get_source()
          << (m->has_more() ? " (more)" : "") << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  if (!session)
    return;

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
    deny = true;
  } else {
    std::string error_str;
    if (!session->is_open()) {
      error_str = "session is closed";
    } else if (mdcache->is_readonly()) {
      error_str = "mds is readonly";
    } else {
      if (session->info.client_metadata.features.empty())
        infer_supported_features(session, session->info.client_metadata);

      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        error_str = ss.str();
      }
    }

    if (!error_str.empty()) {
      deny = true;
      dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
      mds->clog->info() << "denied reconnect attempt from "
                        << m->get_source_inst() << " (" << error_str << ")";
    }
  }

  if (deny) {
    auto r = MClientSession::create(CEPH_SESSION_CLOSE);
    mds->send_message_client(r, session);
    if (session->is_open())
      kill_session(session, nullptr);
    return;
  }

  if (!m->has_more()) {
    // notify client of success with an OPEN
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
  }

  session->last_cap_renew = clock::now();

  // snaprealms
  for (const auto &r : m->realms) {
    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      if (in->snaprealm) {
        dout(15) << "open snaprealm (w inode) on " << *in << dendl;
      } else {
        // this can happen if we are non-auth or we rollback snaprealm
        dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
      }
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
               << " seq " << r.realm.seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    }
  }

  // caps
  for (const auto &p : m->caps) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p.second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p.first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours.  take note.
      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p.second, session);
      mdcache->add_reconnected_cap(from, p.first, p.second);
      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      mdcache->rejoin_export_caps(p.first, from, p.second,
                                  in->authority().first, true);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
    }
  }

  reconnect_last_seen = clock::now();

  if (!m->has_more()) {
    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

    // remove from gather set
    client_reconnect_gather.erase(from);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
  }
}

void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
{
  int supported = -1;
  auto it = client_metadata.find("ceph_version");
  if (it != client_metadata.end()) {
    // user space client
    if (it->second.compare(0, 16, "ceph version 12.") == 0)
      supported = CEPHFS_FEATURE_LUMINOUS;
    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
      supported = CEPHFS_FEATURE_KRAKEN;
  } else {
    it = client_metadata.find("kernel_version");
    if (it != client_metadata.end()) {
      // kernel client
      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
        supported = CEPHFS_FEATURE_LUMINOUS;
    }
  }
  if (supported == -1 &&
      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
    supported = CEPHFS_FEATURE_JEWEL;

  if (supported >= 0) {
    unsigned long value = (1UL << (supported + 1)) - 1;
    client_metadata.features = feature_bitset_t(value);
    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
  }
}
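
// Worked example (added, derived from the code above): if `supported` ends
// up as CEPHFS_FEATURE_LUMINOUS, then value = (1UL << (LUMINOUS+1)) - 1 is a
// mask with every bit up to and including LUMINOUS set. Feature bits are
// cumulative, so a client inferred at release N is assumed to carry all
// features introduced at or before N.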

void Server::update_required_client_features()
{
  vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;

  int min_compat = mds->mdsmap->get_min_compat_client();
  if (min_compat >= CEPH_RELEASE_NAUTILUS)
    bits.push_back(CEPHFS_FEATURE_NAUTILUS);
  else if (min_compat >= CEPH_RELEASE_MIMIC)
    bits.push_back(CEPHFS_FEATURE_MIMIC);
  else if (min_compat >= CEPH_RELEASE_LUMINOUS)
    bits.push_back(CEPHFS_FEATURE_LUMINOUS);
  else if (min_compat >= CEPH_RELEASE_KRAKEN)
    bits.push_back(CEPHFS_FEATURE_KRAKEN);
  else if (min_compat >= CEPH_RELEASE_JEWEL)
    bits.push_back(CEPHFS_FEATURE_JEWEL);

  std::sort(bits.begin(), bits.end());
  required_client_features = feature_bitset_t(bits);
  dout(7) << "required_client_features: " << required_client_features << dendl;

  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
    set<Session*> sessions;
    mds->sessionmap.get_client_session_set(sessions);
    for (auto session : sessions) {
      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        bool blacklisted = mds->objecter->with_osdmap(
            [session](const OSDMap &osd_map) -> bool {
              return osd_map.is_blacklisted(session->info.inst.addr);
            });
        if (blacklisted)
          continue;

        mds->clog->warn() << "evicting session " << *session << ", missing required features '"
                          << missing_features << "'";
        std::stringstream ss;
        mds->evict_client(session->get_client().v, false,
                          g_conf()->mds_session_blacklist_on_evict, ss);
      }
    }
  }
}

void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
  ceph_assert(reconnect_done);

  if (!mds->snapclient->is_synced()) {
    // make sure snaptable cache is populated.  snaprealms will be
    // extensively used in rejoin stage.
    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
    mds->snapclient->wait_for_sync(reconnect_done);
  } else {
    reconnect_done->complete(0);
  }
  reconnect_done = NULL;
}

void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  if (client_reconnect_gather.empty())
    return;

  auto now = clock::now();
  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
  if (elapse1 < g_conf()->mds_reconnect_timeout)
    return;

  vector<Session*> remaining_sessions;
  remaining_sessions.reserve(client_reconnect_gather.size());
  for (auto c : client_reconnect_gather) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
    ceph_assert(session);
    remaining_sessions.push_back(session);
    // client re-sends cap flush messages before the reconnect message
    if (session->last_seen > reconnect_last_seen)
      reconnect_last_seen = session->last_seen;
  }

  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
  if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
    dout(7) << "reconnect_tick: last seen " << elapse2
            << " seconds ago, extending reconnect interval" << dendl;
    return;
  }

  dout(7) << "reconnect timed out, " << remaining_sessions.size()
          << " clients have not reconnected in time" << dendl;

  // If we're doing blacklist evictions, use this to wait for them before
  // proceeding to reconnect_gather_finish
  MDSGatherBuilder gather(g_ceph_context);

  for (auto session : remaining_sessions) {
    // Keep sessions that have specified timeout.  These sessions will prevent
    // mds from going to active.  MDS goes to active after they all have been
    // killed or reclaimed.
    if (session->info.client_metadata.find("timeout") !=
        session->info.client_metadata.end()) {
      dout(1) << "reconnect keeps " << session->info.inst
              << ", need to be reclaimed" << dendl;
      client_reclaim_gather.insert(session->get_client());
      continue;
    }

    dout(1) << "reconnect gives up on " << session->info.inst << dendl;

    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after waiting " << elapse1
                      << " seconds during MDS startup";

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss,
                        gather.new_sub());
    } else {
      kill_session(session, NULL);
    }

    failed_reconnects++;
  }
  client_reconnect_gather.clear();

  if (gather.has_subs()) {
    dout(1) << "reconnect will complete once clients are evicted" << dendl;
    gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
            [this](int r){reconnect_gather_finish();})));
    gather.activate();
    reconnect_evicting = true;
  } else {
    reconnect_gather_finish();
  }
}

void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  auto p = locks.cbegin();
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}
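
// Wire format consumed above (added commentary, as implied by the decode
// calls): the buffer carries two back-to-back lock lists, each lock re-keyed
// to the reconnecting client before insertion:
//
//   int32 numlocks; ceph_filelock[numlocks]   // fcntl (POSIX) locks
//   int32 numlocks; ceph_filelock[numlocks]   // flock (BSD) locks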
1541
1542 /**
1543 * Call this when the MDCache is oversized, to send requests to the clients
1544 * to trim some caps, and consequently unpin some inodes in the MDCache so
1545 * that it can trim too.
1546 */
1547 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1548 {
1549 const auto now = clock::now();
1550 const bool steady = flags&RecallFlags::STEADY;
1551 const bool enforce_max = flags&RecallFlags::ENFORCE_MAX;
1552
1553 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1554 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1555 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1556 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1557 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1558
1559 dout(7) << __func__ << ":"
1560 << " min=" << min_caps_per_client
1561 << " max=" << max_caps_per_client
1562 << " total=" << Capability::count()
1563 << " flags=0x" << std::hex << flags
1564 << dendl;
1565
1566 /* trim caps of sessions with the most caps first */
1567 std::multimap<uint64_t, Session*> caps_session;
1568 auto f = [&caps_session, enforce_max, max_caps_per_client](auto& s) {
1569 auto num_caps = s->caps.size();
1570 if (!enforce_max || num_caps > max_caps_per_client) {
1571 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1572 }
1573 };
1574 mds->sessionmap.get_client_sessions(std::move(f));
1575
1576 std::pair<bool, uint64_t> result = {false, 0};
1577 auto& [throttled, caps_recalled] = result;
1578 last_recall_state = now;
1579 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1580 if (!session->is_open() ||
1581 !session->get_connection() ||
1582 !session->info.inst.name.is_client())
1583 continue;
1584
1585 dout(10) << __func__ << ":"
1586 << " session " << session->info.inst
1587 << " caps " << num_caps
1588 << ", leases " << session->leases.size()
1589 << dendl;
1590
1591 uint64_t newlim;
1592 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1593 newlim = min_caps_per_client;
1594 } else {
1595 newlim = num_caps-recall_max_caps;
1596 }
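// Worked example of the computation above (illustrative numbers, not
// defaults): with num_caps=10000, recall_max_caps=5000 and
// min_caps_per_client=100, newlim = 10000-5000 = 5000; with num_caps=5050,
// (5050-5000) < 100, so newlim falls back to min_caps_per_client = 100.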
1597 if (num_caps > newlim) {
1598 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1599 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1600 newlim = num_caps-recall;
1601 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1602 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1603 const uint64_t global_recall_throttle = recall_throttle.get();
1604 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1605 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1606 throttled = true;
1607 continue;
1608 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1609 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1610 throttled = true;
1611 continue;
1612 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1613 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1614 throttled = true;
1615 break;
1616 }
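// To summarize the three throttles above: a session is skipped if this recall
// would push its decayed recall counter past mds_recall_max_decay_threshold or
// its second-order counter past 2*mds_recall_max_caps; the whole pass stops
// once the global decayed counter would pass
// mds_recall_global_max_decay_threshold.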
1617
1618 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1619 if (steady) {
1620 const auto session_recall = session->get_recall_caps();
1621 const auto session_release = session->get_release_caps();
1622 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1623 /* The session has been unable to keep up with the number of caps
1624 * recalled (by half); additionally, to avoid penalizing sessions we've
1625 * just begun to recall from, the session_recall counter (decayed count
1626 * of caps recently recalled) must be **greater** than half the session
1627 * threshold (recall_max_decay_threshold) for the cap recall throttle.
1628 */
1629 dout(15) << " 2*session_release < session_recall"
1630 " (2*" << session_release << " < " << session_recall << ") &&"
1631 " 2*session_recall < recall_max_decay_threshold"
1632 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1633 " Skipping because we are unlikely to get more released." << dendl;
1634 continue;
1635 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1636 /* The number of caps to recall is less than the number we *could*
1637 * recall (so there isn't much left to recall?) and is also less than
1638 * half the current recall_caps counter (decayed count of caps
1639 * recently recalled): we recently recalled far more than this.
1640 */
1641 dout(15) << " 2*recall < session_recall "
1642 " (2*" << recall << " < " << session_recall << ") &&"
1643 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1644 " Skipping because we are unlikely to get more released." << dendl;
1645 continue;
1646 }
1647 }
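// Worked example of the steady-state skips above (illustrative numbers): with
// recall_max_decay_threshold=16384, session_recall=20000 and
// session_release=8000, 2*8000 < 20000 and 2*20000 > 16384, so the session is
// skipped as unable to keep up; with recall=1000, recall_max_caps=5000 and
// session_recall=3000, 1000 < 5000 and 2*1000 < 3000, so it is likewise
// skipped.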
1648
1649 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1650
1651 auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
1652 m->head.max_caps = newlim;
1653 mds->send_message_client(m, session);
1654 if (gather) {
1655 flush_session(session, gather);
1656 }
1657 caps_recalled += session->notify_recall_sent(newlim);
1658 recall_throttle.hit(recall);
1659 }
1660 }
1661
1662 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1663
1664 return result;
1665 }
1666
1667 void Server::force_clients_readonly()
1668 {
1669 dout(10) << "force_clients_readonly" << dendl;
1670 set<Session*> sessions;
1671 mds->sessionmap.get_client_session_set(sessions);
1672 for (set<Session*>::const_iterator p = sessions.begin();
1673 p != sessions.end();
1674 ++p) {
1675 Session *session = *p;
1676 if (!session->info.inst.name.is_client() ||
1677 !(session->is_open() || session->is_stale()))
1678 continue;
1679 mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
1680 }
1681 }
1682
1683 /*******
1684 * some generic stuff for finishing off requests
1685 */
1686 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1687 {
1688 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1689 ceph_assert(!mdr->has_completed);
1690
1691 // note trace items for eventual reply.
1692 mdr->tracei = in;
1693 if (in)
1694 mdr->pin(in);
1695
1696 mdr->tracedn = dn;
1697 if (dn)
1698 mdr->pin(dn);
1699
1700 early_reply(mdr, in, dn);
1701
1702 mdr->committing = true;
1703 submit_mdlog_entry(le, fin, mdr, __func__);
1704
1705 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1706 if (mds->queue_one_replay()) {
1707 dout(10) << " queued next replay op" << dendl;
1708 } else {
1709 dout(10) << " journaled last replay op" << dendl;
1710 }
1711 } else if (mdr->did_early_reply)
1712 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1713 else
1714 mdlog->flush();
1715 }
1716
1717 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1718 std::string_view event)
1719 {
1720 if (mdr) {
1721 string event_str("submit entry: ");
1722 event_str += event;
1723 mdr->mark_event(event_str);
1724 }
1725 mdlog->submit_entry(le, fin);
1726 }
1727
1728 /*
1729 * send response built from mdr contents and error code; clean up mdr
1730 */
1731 void Server::respond_to_request(MDRequestRef& mdr, int r)
1732 {
1733 if (mdr->client_request) {
1734 reply_client_request(mdr, MClientReply::create(*mdr->client_request, r));
1735 } else if (mdr->internal_op > -1) {
1736 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1737 if (!mdr->internal_op_finish)
1738 ceph_abort_msg("trying to respond to internal op without finisher");
1739 mdr->internal_op_finish->complete(r);
1740 mdcache->request_finish(mdr);
1741 }
1742 }
1743
1744 // statistics mds req op number and latency
1745 void Server::perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat)
1746 {
1747 int code = l_mdss_first;
1748 switch(req->get_op()) {
1749 case CEPH_MDS_OP_LOOKUPHASH:
1750 code = l_mdss_req_lookuphash_latency;
1751 break;
1752 case CEPH_MDS_OP_LOOKUPINO:
1753 code = l_mdss_req_lookupino_latency;
1754 break;
1755 case CEPH_MDS_OP_LOOKUPPARENT:
1756 code = l_mdss_req_lookupparent_latency;
1757 break;
1758 case CEPH_MDS_OP_LOOKUPNAME:
1759 code = l_mdss_req_lookupname_latency;
1760 break;
1761 case CEPH_MDS_OP_LOOKUP:
1762 code = l_mdss_req_lookup_latency;
1763 break;
1764 case CEPH_MDS_OP_LOOKUPSNAP:
1765 code = l_mdss_req_lookupsnap_latency;
1766 break;
1767 case CEPH_MDS_OP_GETATTR:
1768 code = l_mdss_req_getattr_latency;
1769 break;
1770 case CEPH_MDS_OP_SETATTR:
1771 code = l_mdss_req_setattr_latency;
1772 break;
1773 case CEPH_MDS_OP_SETLAYOUT:
1774 code = l_mdss_req_setlayout_latency;
1775 break;
1776 case CEPH_MDS_OP_SETDIRLAYOUT:
1777 code = l_mdss_req_setdirlayout_latency;
1778 break;
1779 case CEPH_MDS_OP_SETXATTR:
1780 code = l_mdss_req_setxattr_latency;
1781 break;
1782 case CEPH_MDS_OP_RMXATTR:
1783 code = l_mdss_req_rmxattr_latency;
1784 break;
1785 case CEPH_MDS_OP_READDIR:
1786 code = l_mdss_req_readdir_latency;
1787 break;
1788 case CEPH_MDS_OP_SETFILELOCK:
1789 code = l_mdss_req_setfilelock_latency;
1790 break;
1791 case CEPH_MDS_OP_GETFILELOCK:
1792 code = l_mdss_req_getfilelock_latency;
1793 break;
1794 case CEPH_MDS_OP_CREATE:
1795 code = l_mdss_req_create_latency;
1796 break;
1797 case CEPH_MDS_OP_OPEN:
1798 code = l_mdss_req_open_latency;
1799 break;
1800 case CEPH_MDS_OP_MKNOD:
1801 code = l_mdss_req_mknod_latency;
1802 break;
1803 case CEPH_MDS_OP_LINK:
1804 code = l_mdss_req_link_latency;
1805 break;
1806 case CEPH_MDS_OP_UNLINK:
1807 code = l_mdss_req_unlink_latency;
1808 break;
1809 case CEPH_MDS_OP_RMDIR:
1810 code = l_mdss_req_rmdir_latency;
1811 break;
1812 case CEPH_MDS_OP_RENAME:
1813 code = l_mdss_req_rename_latency;
1814 break;
1815 case CEPH_MDS_OP_MKDIR:
1816 code = l_mdss_req_mkdir_latency;
1817 break;
1818 case CEPH_MDS_OP_SYMLINK:
1819 code = l_mdss_req_symlink_latency;
1820 break;
1821 case CEPH_MDS_OP_LSSNAP:
1822 code = l_mdss_req_lssnap_latency;
1823 break;
1824 case CEPH_MDS_OP_MKSNAP:
1825 code = l_mdss_req_mksnap_latency;
1826 break;
1827 case CEPH_MDS_OP_RMSNAP:
1828 code = l_mdss_req_rmsnap_latency;
1829 break;
1830 case CEPH_MDS_OP_RENAMESNAP:
1831 code = l_mdss_req_renamesnap_latency;
1832 break;
1833 default: ceph_abort();
1834 }
1835 logger->tinc(code, lat);
1836 }
1837
1838 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1839 {
1840 if (!g_conf()->mds_early_reply)
1841 return;
1842
1843 if (mdr->no_early_reply) {
1844 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1845 return;
1846 }
1847
1848 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
1849 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
1850 return;
1851 }
1852
1853 if (mdr->alloc_ino) {
1854 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
1855 return;
1856 }
1857
1858 const MClientRequest::const_ref &req = mdr->client_request;
1859 entity_inst_t client_inst = req->get_source_inst();
1860 if (client_inst.name.is_mds())
1861 return;
1862
1863 if (req->is_replay()) {
1864 dout(10) << " no early reply on replay op" << dendl;
1865 return;
1866 }
1867
1868
1869 auto reply = MClientReply::create(*req, 0);
1870 reply->set_unsafe();
1871
1872 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1873 //
1874 // _rename_finish() does not send dentry link/unlink messages to replicas,
1875 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
1876 // that have projected linkages from getting new replicas.
1877 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
1878
1879 dout(10) << "early_reply " << reply->get_result()
1880 << " (" << cpp_strerror(reply->get_result())
1881 << ") " << *req << dendl;
1882
1883 if (tracei || tracedn) {
1884 if (tracei)
1885 mdr->cap_releases.erase(tracei->vino());
1886 if (tracedn)
1887 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1888
1889 set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
1890 req->get_dentry_wanted(), mdr);
1891 }
1892
1893 reply->set_extra_bl(mdr->reply_extra_bl);
1894 mds->send_message_client(reply, mdr->session);
1895
1896 mdr->did_early_reply = true;
1897
1898 mds->logger->inc(l_mds_reply);
1899 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
1900 mds->logger->tinc(l_mds_reply_latency, lat);
1901 if (client_inst.name.is_client()) {
1902 mds->sessionmap.hit_session(mdr->session);
1903 }
1904 perf_gather_op_latency(req, lat);
1905 dout(20) << "lat " << lat << dendl;
1906
1907 mdr->mark_event("early_replied");
1908 }
1909
1910 /*
1911 * send given reply
1912 * include a trace to tracei
1913 * Clean up mdr
1914 */
1915 void Server::reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply)
1916 {
1917 ceph_assert(mdr.get());
1918 const MClientRequest::const_ref &req = mdr->client_request;
1919
1920 dout(7) << "reply_client_request " << reply->get_result()
1921 << " (" << cpp_strerror(reply->get_result())
1922 << ") " << *req << dendl;
1923
1924 mdr->mark_event("replying");
1925
1926 Session *session = mdr->session;
1927
1928 // note successful request in session map?
1929 //
1930 // setfilelock requests are special: they only modify state in MDS memory,
1931 // and that state is lost when the MDS fails. If a client re-sends a completed
1932 // setfilelock request, it means the client did not receive the corresponding
1933 // setfilelock reply, so the MDS should re-execute the request.
1934 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1935 reply->get_result() == 0 && session) {
1936 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1937 session->add_completed_request(mdr->reqid.tid, created);
1938 if (mdr->ls) {
1939 mdr->ls->touched_sessions.insert(session->info.inst.name);
1940 }
1941 }
1942
1943 // give any preallocated inos to the session
1944 apply_allocated_inos(mdr, session);
1945
1946 // get tracei/tracedn from mdr?
1947 snapid_t snapid = mdr->snapid;
1948 CInode *tracei = mdr->tracei;
1949 CDentry *tracedn = mdr->tracedn;
1950
1951 bool is_replay = mdr->client_request->is_replay();
1952 bool did_early_reply = mdr->did_early_reply;
1953 entity_inst_t client_inst = req->get_source_inst();
1954 int dentry_wanted = req->get_dentry_wanted();
1955
1956 if (!did_early_reply && !is_replay) {
1957
1958 mds->logger->inc(l_mds_reply);
1959 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1960 mds->logger->tinc(l_mds_reply_latency, lat);
1961 if (session && client_inst.name.is_client()) {
1962 mds->sessionmap.hit_session(session);
1963 }
1964 perf_gather_op_latency(req, lat);
1965 dout(20) << "lat " << lat << dendl;
1966
1967 if (tracei)
1968 mdr->cap_releases.erase(tracei->vino());
1969 if (tracedn)
1970 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1971 }
1972
1973 // drop non-rdlocks before replying, so that we can issue leases
1974 mdcache->request_drop_non_rdlocks(mdr);
1975
1976 // reply at all?
1977 if (session && !client_inst.name.is_mds()) {
1978 // send reply.
1979 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1980 (tracei || tracedn)) {
1981 if (is_replay) {
1982 if (tracei)
1983 mdcache->try_reconnect_cap(tracei, session);
1984 } else {
1985 // include metadata in reply
1986 set_trace_dist(session, reply, tracei, tracedn,
1987 snapid, dentry_wanted,
1988 mdr);
1989 }
1990 }
1991
1992 // We can set the extra bl unconditionally: if it's already been sent in the
1993 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1994 reply->set_extra_bl(mdr->reply_extra_bl);
1995
1996 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
1997 mds->send_message_client(reply, session);
1998 }
1999
2000 if (req->is_queued_for_replay() &&
2001 (mdr->has_completed || reply->get_result() < 0)) {
2002 if (reply->get_result() < 0) {
2003 int r = reply->get_result();
2004 derr << "reply_client_request: failed to replay " << *req
2005 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2006 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2007 }
2008 mds->queue_one_replay();
2009 }
2010
2011 // clean up request
2012 mdcache->request_finish(mdr);
2013
2014 // take a closer look at tracei, if it happens to be a remote link
2015 if (tracei &&
2016 tracedn &&
2017 tracedn->get_projected_linkage()->is_remote()) {
2018 mdcache->eval_remote(tracedn);
2019 }
2020 }
2021
2022 /*
2023 * pass inode OR dentry (not both, or we may get confused)
2024 *
2025 * trace is in reverse order (i.e. root inode comes last)
2026 */
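// A sketch of the trace assembled below, inferred from the encode calls (the
// client decoder is authoritative): the snap realm trace goes in reply->snapbl
// when snapid == CEPH_NOSNAP; the trace bufferlist then carries, for a dentry,
// [diri inodestat][dirstat][dentry name][lease], followed by the target
// inodestat when 'in' is set.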
2027 void Server::set_trace_dist(Session *session, const MClientReply::ref &reply,
2028 CInode *in, CDentry *dn,
2029 snapid_t snapid,
2030 int dentry_wanted,
2031 MDRequestRef& mdr)
2032 {
2033 // skip doing this for debugging purposes?
2034 if (g_conf()->mds_inject_traceless_reply_probability &&
2035 mdr->ls && !mdr->o_trunc &&
2036 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2037 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2038 return;
2039 }
2040
2041 // inode, dentry, dir, ..., inode
2042 bufferlist bl;
2043 mds_rank_t whoami = mds->get_nodeid();
2044 client_t client = session->get_client();
2045 utime_t now = ceph_clock_now();
2046
2047 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2048
2049 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
2050
2051 // realm
2052 if (snapid == CEPH_NOSNAP) {
2053 SnapRealm *realm;
2054 if (in)
2055 realm = in->find_snaprealm();
2056 else
2057 realm = dn->get_dir()->get_inode()->find_snaprealm();
2058 reply->snapbl = realm->get_snap_trace();
2059 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2060 }
2061
2062 // dir + dentry?
2063 if (dn) {
2064 reply->head.is_dentry = 1;
2065 CDir *dir = dn->get_dir();
2066 CInode *diri = dir->get_inode();
2067
2068 diri->encode_inodestat(bl, session, NULL, snapid);
2069 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2070
2071 #ifdef MDS_VERIFY_FRAGSTAT
2072 if (dir->is_complete())
2073 dir->verify_fragstat();
2074 #endif
2075 DirStat ds;
2076 ds.frag = dir->get_frag();
2077 ds.auth = dir->get_dir_auth().first;
2078 if (dir->is_auth())
2079 dir->get_dist_spec(ds.dist, whoami);
2080
2081 dir->encode_dirstat(bl, session->info, ds);
2082 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2083
2084 encode(dn->get_name(), bl);
2085 if (snapid == CEPH_NOSNAP)
2086 mds->locker->issue_client_lease(dn, client, bl, now, session);
2087 else {
2088 //null lease
2089 LeaseStat e;
2090 mds->locker->encode_lease(bl, session->info, e);
2091 }
2092 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2093 } else
2094 reply->head.is_dentry = 0;
2095
2096 // inode
2097 if (in) {
2098 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2099 dout(20) << "set_trace_dist added in " << *in << dendl;
2100 reply->head.is_target = 1;
2101 } else
2102 reply->head.is_target = 0;
2103
2104 reply->set_trace(bl);
2105 }
2106
2107 void Server::handle_client_request(const MClientRequest::const_ref &req)
2108 {
2109 dout(4) << "handle_client_request " << *req << dendl;
2110
2111 if (mds->logger)
2112 mds->logger->inc(l_mds_request);
2113 if (logger)
2114 logger->inc(l_mdss_handle_client_request);
2115
2116 if (!mdcache->is_open()) {
2117 dout(5) << "waiting for root" << dendl;
2118 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2119 return;
2120 }
2121
2122 // active session?
2123 Session *session = 0;
2124 if (req->get_source().is_client()) {
2125 session = mds->get_session(req);
2126 if (!session) {
2127 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2128 } else if (session->is_closed() ||
2129 session->is_closing() ||
2130 session->is_killing()) {
2131 dout(5) << "session closed|closing|killing, dropping" << dendl;
2132 session = NULL;
2133 }
2134 if (!session) {
2135 if (req->is_queued_for_replay())
2136 mds->queue_one_replay();
2137 return;
2138 }
2139 }
2140
2141 // old mdsmap?
2142 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2143 // send it? hrm, this isn't ideal; they may get a lot of copies if
2144 // they have a high request rate.
2145 }
2146
2147 // completed request?
2148 bool has_completed = false;
2149 if (req->is_replay() || req->get_retry_attempt()) {
2150 ceph_assert(session);
2151 inodeno_t created;
2152 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2153 has_completed = true;
2154 // Don't send a traceless reply if the completed request created a
2155 // new inode. Treat the request as a lookup request instead.
2156 if (req->is_replay() ||
2157 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2158 req->get_op() != CEPH_MDS_OP_OPEN &&
2159 req->get_op() != CEPH_MDS_OP_CREATE)) {
2160 dout(5) << "already completed " << req->get_reqid() << dendl;
2161 auto reply = MClientReply::create(*req, 0);
2162 if (created != inodeno_t()) {
2163 bufferlist extra;
2164 encode(created, extra);
2165 reply->set_extra_bl(extra);
2166 }
2167 mds->send_message_client(reply, session);
2168
2169 if (req->is_queued_for_replay())
2170 mds->queue_one_replay();
2171
2172 return;
2173 }
2174 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2175 req->get_op() != CEPH_MDS_OP_CREATE) {
2176 dout(10) << " completed request which created new inode " << created
2177 << ", convert it to lookup request" << dendl;
2178 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2179 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2180 }
2181 }
2182 }
2183
2184 // trim completed_request list
2185 if (req->get_oldest_client_tid() > 0) {
2186 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2187 ceph_assert(session);
2188 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2189 // Session's 'completed_requests' was dirtied, mark it to be
2190 // potentially flushed at segment expiry.
2191 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2192
2193 if (session->get_num_trim_requests_warnings() > 0 &&
2194 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2195 session->reset_num_trim_requests_warnings();
2196 } else {
2197 if (session->get_num_completed_requests() >=
2198 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2199 session->inc_num_trim_requests_warnings();
2200 stringstream ss;
2201 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2202 << req->get_oldest_client_tid() << "), "
2203 << session->get_num_completed_requests()
2204 << " completed requests recorded in session\n";
2205 mds->clog->warn() << ss.str();
2206 dout(20) << __func__ << " " << ss.str() << dendl;
2207 }
2208 }
2209 }
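// Worked example of the warning threshold above (illustrative value): with
// mds_max_completed_requests=100000 and one prior warning, the next warning
// fires only once the session holds >= (100000 << 1) = 200000 completed
// requests, i.e. the threshold doubles with each warning issued.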
2210
2211 // register + dispatch
2212 MDRequestRef mdr = mdcache->request_start(req);
2213 if (!mdr.get())
2214 return;
2215
2216 if (session) {
2217 mdr->session = session;
2218 session->requests.push_back(&mdr->item_session_request);
2219 }
2220
2221 if (has_completed)
2222 mdr->has_completed = true;
2223
2224 // process embedded cap releases?
2225 // (only if NOT replay!)
2226 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2227 client_t client = req->get_source().num();
2228 for (const auto &r : req->releases) {
2229 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2230 }
2231 req->releases.clear();
2232 }
2233
2234 dispatch_client_request(mdr);
2235 return;
2236 }
2237
2238 void Server::handle_osd_map()
2239 {
2240 /* Note that we check the OSDMAP_FULL flag directly rather than
2241 * using osdmap_full_flag(), because we want to know "is the flag set"
2242 * rather than "does the flag apply to us?" */
2243 mds->objecter->with_osdmap([this](const OSDMap& o) {
2244 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2245 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2246 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2247 << o.get_epoch() << dendl;
2248 });
2249 }
2250
2251 void Server::dispatch_client_request(MDRequestRef& mdr)
2252 {
2253 // we shouldn't be waiting on anyone.
2254 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
2255
2256 if (mdr->killed) {
2257 dout(10) << "request " << *mdr << " was killed" << dendl;
2258 return;
2259 } else if (mdr->aborted) {
2260 mdr->aborted = false;
2261 mdcache->request_kill(mdr);
2262 return;
2263 }
2264
2265 const MClientRequest::const_ref &req = mdr->client_request;
2266
2267 if (logger) logger->inc(l_mdss_dispatch_client_request);
2268
2269 dout(7) << "dispatch_client_request " << *req << dendl;
2270
2271 if (req->may_write()) {
2272 if (mdcache->is_readonly()) {
2273 dout(10) << " read-only FS" << dendl;
2274 respond_to_request(mdr, -EROFS);
2275 return;
2276 }
2277 if (mdr->has_more() && mdr->more()->slave_error) {
2278 dout(10) << " got error from slaves" << dendl;
2279 respond_to_request(mdr, mdr->more()->slave_error);
2280 return;
2281 }
2282 }
2283
2284 if (is_full) {
2285 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2286 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2288 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2289 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2290 req->get_op() == CEPH_MDS_OP_CREATE ||
2291 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2292 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2293 ((req->get_op() == CEPH_MDS_OP_LINK ||
2294 req->get_op() == CEPH_MDS_OP_RENAME) &&
2295 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
2296 ) {
2297
2298 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2299 respond_to_request(mdr, -ENOSPC);
2300 return;
2301 } else {
2302 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2303 }
2304 }
2305
2306 switch (req->get_op()) {
2307 case CEPH_MDS_OP_LOOKUPHASH:
2308 case CEPH_MDS_OP_LOOKUPINO:
2309 handle_client_lookup_ino(mdr, false, false);
2310 break;
2311 case CEPH_MDS_OP_LOOKUPPARENT:
2312 handle_client_lookup_ino(mdr, true, false);
2313 break;
2314 case CEPH_MDS_OP_LOOKUPNAME:
2315 handle_client_lookup_ino(mdr, false, true);
2316 break;
2317
2318 // inodes ops.
2319 case CEPH_MDS_OP_LOOKUP:
2320 handle_client_getattr(mdr, true);
2321 break;
2322
2323 case CEPH_MDS_OP_LOOKUPSNAP:
2324 // lookupsnap does not reference a CDentry; treat it as a getattr
2325 case CEPH_MDS_OP_GETATTR:
2326 handle_client_getattr(mdr, false);
2327 break;
2328
2329 case CEPH_MDS_OP_SETATTR:
2330 handle_client_setattr(mdr);
2331 break;
2332 case CEPH_MDS_OP_SETLAYOUT:
2333 handle_client_setlayout(mdr);
2334 break;
2335 case CEPH_MDS_OP_SETDIRLAYOUT:
2336 handle_client_setdirlayout(mdr);
2337 break;
2338 case CEPH_MDS_OP_SETXATTR:
2339 handle_client_setxattr(mdr);
2340 break;
2341 case CEPH_MDS_OP_RMXATTR:
2342 handle_client_removexattr(mdr);
2343 break;
2344
2345 case CEPH_MDS_OP_READDIR:
2346 handle_client_readdir(mdr);
2347 break;
2348
2349 case CEPH_MDS_OP_SETFILELOCK:
2350 handle_client_file_setlock(mdr);
2351 break;
2352
2353 case CEPH_MDS_OP_GETFILELOCK:
2354 handle_client_file_readlock(mdr);
2355 break;
2356
2357 // funky.
2358 case CEPH_MDS_OP_CREATE:
2359 if (mdr->has_completed)
2360 handle_client_open(mdr); // already created.. just open
2361 else
2362 handle_client_openc(mdr);
2363 break;
2364
2365 case CEPH_MDS_OP_OPEN:
2366 handle_client_open(mdr);
2367 break;
2368
2369 // namespace.
2370 // no prior locks.
2371 case CEPH_MDS_OP_MKNOD:
2372 handle_client_mknod(mdr);
2373 break;
2374 case CEPH_MDS_OP_LINK:
2375 handle_client_link(mdr);
2376 break;
2377 case CEPH_MDS_OP_UNLINK:
2378 case CEPH_MDS_OP_RMDIR:
2379 handle_client_unlink(mdr);
2380 break;
2381 case CEPH_MDS_OP_RENAME:
2382 handle_client_rename(mdr);
2383 break;
2384 case CEPH_MDS_OP_MKDIR:
2385 handle_client_mkdir(mdr);
2386 break;
2387 case CEPH_MDS_OP_SYMLINK:
2388 handle_client_symlink(mdr);
2389 break;
2390
2391
2392 // snaps
2393 case CEPH_MDS_OP_LSSNAP:
2394 handle_client_lssnap(mdr);
2395 break;
2396 case CEPH_MDS_OP_MKSNAP:
2397 handle_client_mksnap(mdr);
2398 break;
2399 case CEPH_MDS_OP_RMSNAP:
2400 handle_client_rmsnap(mdr);
2401 break;
2402 case CEPH_MDS_OP_RENAMESNAP:
2403 handle_client_renamesnap(mdr);
2404 break;
2405
2406 default:
2407 dout(1) << " unknown client op " << req->get_op() << dendl;
2408 respond_to_request(mdr, -EOPNOTSUPP);
2409 }
2410 }
2411
2412
2413 // ---------------------------------------
2414 // SLAVE REQUESTS
2415
2416 void Server::handle_slave_request(const MMDSSlaveRequest::const_ref &m)
2417 {
2418 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2419 mds_rank_t from = mds_rank_t(m->get_source().num());
2420
2421 if (logger) logger->inc(l_mdss_handle_slave_request);
2422
2423 // reply?
2424 if (m->is_reply())
2425 return handle_slave_request_reply(m);
2426
2427 // The purpose of rename notify is to enforce causal message ordering: making sure
2428 // bystanders have received all messages from the rename srcdn's auth MDS.
2429 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
2430 auto reply = MMDSSlaveRequest::create(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
2431 mds->send_message(reply, m->get_connection());
2432 return;
2433 }
2434
2435 CDentry *straydn = NULL;
2436 if (m->straybl.length() > 0) {
2437 straydn = mdcache->add_replica_stray(m->straybl, from);
2438 ceph_assert(straydn);
2439 m->straybl.clear();
2440 }
2441
2442 // am i a new slave?
2443 MDRequestRef mdr;
2444 if (mdcache->have_request(m->get_reqid())) {
2445 // existing?
2446 mdr = mdcache->request_get(m->get_reqid());
2447
2448 // is my request newer?
2449 if (mdr->attempt > m->get_attempt()) {
2450 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2451 << ", dropping " << *m << dendl;
2452 return;
2453 }
2454
2455
2456 if (mdr->attempt < m->get_attempt()) {
2457 // mine is old, close it out
2458 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2459 << ", closing out" << dendl;
2460 mdcache->request_finish(mdr);
2461 mdr.reset();
2462 } else if (mdr->slave_to_mds != from) {
2463 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
2464 return;
2465 }
2466
2467 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
2468 mdr->aborted = true;
2469 if (mdr->slave_request) {
2470 // only abort on-going xlock, wrlock and auth pin
2471 ceph_assert(!mdr->slave_did_prepare());
2472 } else {
2473 mdcache->request_finish(mdr);
2474 }
2475 return;
2476 }
2477 }
2478 if (!mdr.get()) {
2479 // new?
2480 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2481 dout(10) << "missing slave request for " << m->get_reqid()
2482 << " OP_FINISH, must have lost race with a forward" << dendl;
2483 return;
2484 }
2485 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2486 mdr->set_op_stamp(m->op_stamp);
2487 }
2488 ceph_assert(mdr->slave_request == 0); // only one at a time, please!
2489
2490 if (straydn) {
2491 mdr->pin(straydn);
2492 mdr->straydn = straydn;
2493 }
2494
2495 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2496 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2497 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2498 return;
2499 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2500 mdr->locks.empty()) {
2501 dout(3) << "not active yet, waiting" << dendl;
2502 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2503 return;
2504 }
2505
2506 mdr->reset_slave_request(m);
2507
2508 dispatch_slave_request(mdr);
2509 }
2510
2511 void Server::handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m)
2512 {
2513 mds_rank_t from = mds_rank_t(m->get_source().num());
2514
2515 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2516 metareqid_t r = m->get_reqid();
2517 if (!mdcache->have_uncommitted_master(r, from)) {
2518 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2519 << from << " reqid " << r << dendl;
2520 return;
2521 }
2522 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2523 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2524 return;
2525 }
2526
2527 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2528 metareqid_t r = m->get_reqid();
2529 mdcache->committed_master_slave(r, from);
2530 return;
2531 }
2532
2533 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2534 if (m->get_attempt() != mdr->attempt) {
2535 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2536 << m->get_attempt() << dendl;
2537 return;
2538 }
2539
2540 switch (m->get_op()) {
2541 case MMDSSlaveRequest::OP_XLOCKACK:
2542 {
2543 // identify lock, master request
2544 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2545 m->get_object_info());
2546 mdr->more()->slaves.insert(from);
2547 lock->decode_locked_state(m->get_lock_data());
2548 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2549 mdr->locks.emplace_hint(mdr->locks.end(), lock, MutationImpl::LockOp::XLOCK);
2550 mdr->finish_locking(lock);
2551 lock->get_xlock(mdr, mdr->get_client());
2552
2553 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2554 mdr->more()->waiting_on_slave.erase(from);
2555 ceph_assert(mdr->more()->waiting_on_slave.empty());
2556 mdcache->dispatch_request(mdr);
2557 }
2558 break;
2559
2560 case MMDSSlaveRequest::OP_WRLOCKACK:
2561 {
2562 // identify lock, master request
2563 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2564 m->get_object_info());
2565 mdr->more()->slaves.insert(from);
2566 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2567 auto it = mdr->locks.emplace_hint(mdr->locks.end(),
2568 lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2569 ceph_assert(it->is_remote_wrlock());
2570 ceph_assert(it->wrlock_target == from);
2571
2572 mdr->finish_locking(lock);
2573
2574 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2575 mdr->more()->waiting_on_slave.erase(from);
2576 ceph_assert(mdr->more()->waiting_on_slave.empty());
2577 mdcache->dispatch_request(mdr);
2578 }
2579 break;
2580
2581 case MMDSSlaveRequest::OP_AUTHPINACK:
2582 handle_slave_auth_pin_ack(mdr, m);
2583 break;
2584
2585 case MMDSSlaveRequest::OP_LINKPREPACK:
2586 handle_slave_link_prep_ack(mdr, m);
2587 break;
2588
2589 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2590 handle_slave_rmdir_prep_ack(mdr, m);
2591 break;
2592
2593 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2594 handle_slave_rename_prep_ack(mdr, m);
2595 break;
2596
2597 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2598 handle_slave_rename_notify_ack(mdr, m);
2599 break;
2600
2601 default:
2602 ceph_abort();
2603 }
2604 }
2605
2606 void Server::dispatch_slave_request(MDRequestRef& mdr)
2607 {
2608 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2609
2610 if (mdr->aborted) {
2611 dout(7) << " abort flag set, finishing" << dendl;
2612 mdcache->request_finish(mdr);
2613 return;
2614 }
2615
2616 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2617
2618 int op = mdr->slave_request->get_op();
2619 switch (op) {
2620 case MMDSSlaveRequest::OP_XLOCK:
2621 case MMDSSlaveRequest::OP_WRLOCK:
2622 {
2623 // identify object
2624 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2625 mdr->slave_request->get_object_info());
2626
2627 if (!lock) {
2628 dout(10) << "don't have object, dropping" << dendl;
2629 ceph_abort(); // can this happen if we auth pinned properly?
2630 }
2631 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2632 dout(10) << "not auth for remote xlock attempt, dropping on "
2633 << *lock << " on " << *lock->get_parent() << dendl;
2634 } else {
2635 // use acquire_locks so that we get auth_pinning.
2636 MutationImpl::LockOpVec lov;
2637 for (const auto& p : mdr->locks) {
2638 if (p.is_xlock())
2639 lov.add_xlock(p.lock);
2640 else if (p.is_wrlock())
2641 lov.add_wrlock(p.lock);
2642 }
2643
2644 int replycode = 0;
2645 switch (op) {
2646 case MMDSSlaveRequest::OP_XLOCK:
2647 lov.add_xlock(lock);
2648 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2649 break;
2650 case MMDSSlaveRequest::OP_WRLOCK:
2651 lov.add_wrlock(lock);
2652 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2653 break;
2654 }
2655
2656 if (!mds->locker->acquire_locks(mdr, lov))
2657 return;
2658
2659 // ack
2660 auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, replycode);
2661 r->set_lock_type(lock->get_type());
2662 lock->get_parent()->set_object_info(r->get_object_info());
2663 if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
2664 lock->encode_locked_state(r->get_lock_data());
2665 mds->send_message(r, mdr->slave_request->get_connection());
2666 }
2667
2668 // done.
2669 mdr->reset_slave_request();
2670 }
2671 break;
2672
2673 case MMDSSlaveRequest::OP_UNXLOCK:
2674 case MMDSSlaveRequest::OP_UNWRLOCK:
2675 {
2676 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2677 mdr->slave_request->get_object_info());
2678 ceph_assert(lock);
2679 auto it = mdr->locks.find(lock);
2680 ceph_assert(it != mdr->locks.end());
2681 bool need_issue = false;
2682 switch (op) {
2683 case MMDSSlaveRequest::OP_UNXLOCK:
2684 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2685 break;
2686 case MMDSSlaveRequest::OP_UNWRLOCK:
2687 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2688 break;
2689 }
2690 if (need_issue)
2691 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2692
2693 // done. no ack necessary.
2694 mdr->reset_slave_request();
2695 }
2696 break;
2697
2698 case MMDSSlaveRequest::OP_DROPLOCKS:
2699 mds->locker->drop_locks(mdr.get());
2700 mdr->reset_slave_request();
2701 break;
2702
2703 case MMDSSlaveRequest::OP_AUTHPIN:
2704 handle_slave_auth_pin(mdr);
2705 break;
2706
2707 case MMDSSlaveRequest::OP_LINKPREP:
2708 case MMDSSlaveRequest::OP_UNLINKPREP:
2709 handle_slave_link_prep(mdr);
2710 break;
2711
2712 case MMDSSlaveRequest::OP_RMDIRPREP:
2713 handle_slave_rmdir_prep(mdr);
2714 break;
2715
2716 case MMDSSlaveRequest::OP_RENAMEPREP:
2717 handle_slave_rename_prep(mdr);
2718 break;
2719
2720 case MMDSSlaveRequest::OP_FINISH:
2721 // information about rename imported caps
2722 if (mdr->slave_request->inode_export.length() > 0)
2723 mdr->more()->inode_import = mdr->slave_request->inode_export;
2724 // finish off request.
2725 mdcache->request_finish(mdr);
2726 break;
2727
2728 default:
2729 ceph_abort();
2730 }
2731 }
2732
2733 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2734 {
2735 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2736
2737 // build list of objects
2738 list<MDSCacheObject*> objects;
2739 CInode *auth_pin_freeze = NULL;
2740 bool fail = false, wouldblock = false, readonly = false;
2741
2742 if (mdcache->is_readonly()) {
2743 dout(10) << " read-only FS" << dendl;
2744 readonly = true;
2745 fail = true;
2746 }
2747
2748 if (!fail) {
2749 for (const auto &oi : mdr->slave_request->get_authpins()) {
2750 MDSCacheObject *object = mdcache->get_object(oi);
2751 if (!object) {
2752 dout(10) << " don't have " << oi << dendl;
2753 fail = true;
2754 break;
2755 }
2756
2757 objects.push_back(object);
2758 if (oi == mdr->slave_request->get_authpin_freeze())
2759 auth_pin_freeze = static_cast<CInode*>(object);
2760 }
2761 }
2762
2763 // can we auth pin them?
2764 if (!fail) {
2765 for (list<MDSCacheObject*>::iterator p = objects.begin();
2766 p != objects.end();
2767 ++p) {
2768 if (!(*p)->is_auth()) {
2769 dout(10) << " not auth for " << **p << dendl;
2770 fail = true;
2771 break;
2772 }
2773 if (mdr->is_auth_pinned(*p))
2774 continue;
2775 if (!mdr->can_auth_pin(*p)) {
2776 if (mdr->slave_request->is_nonblock()) {
2777 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2778 fail = true;
2779 wouldblock = true;
2780 break;
2781 }
2782 // wait
2783 dout(10) << " waiting for authpinnable on " << **p << dendl;
2784 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2785 mdr->drop_local_auth_pins();
2786
2787 mds->locker->notify_freeze_waiter(*p);
2788 return;
2789 }
2790 }
2791 }
2792
2793 // auth pin!
2794 if (fail) {
2795 mdr->drop_local_auth_pins(); // just in case
2796 } else {
2797 /* unfreeze if we previously froze an auth pin on the wrong inode */
2798 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2799 mdr->more()->rename_inode != auth_pin_freeze)
2800 mdr->unfreeze_auth_pin(true);
2801
2802 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2803 * on the source inode to complete. This happens after all locks for the rename
2804 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2805 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2806 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2807 * is called. The solution is to freeze the inode and prevent other MDRequests
2808 * from getting new auth pins.
2809 */
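// A hypothetical interleaving of the ABBA deadlock described above (a sketch,
// not a captured trace):
//   master: acquires the rename locks, sends OP_RENAMEPREP
//   other request: auth pins the source inode, then waits on a rename lock
//   slave: handle_slave_rename_prep() -> freeze_inode() waits on that auth pin
// Freezing the auth pin here rejects such late pinners up front.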
2810 if (auth_pin_freeze) {
2811 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2812 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2813 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2814 mds->mdlog->flush();
2815 return;
2816 }
2817 }
2818 for (list<MDSCacheObject*>::iterator p = objects.begin();
2819 p != objects.end();
2820 ++p) {
2821 dout(10) << "auth_pinning " << **p << dendl;
2822 mdr->auth_pin(*p);
2823 }
2824 }
2825
2826 // ack!
2827 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2828
2829 // return list of my auth_pins (if any)
2830 for (const auto &p : mdr->auth_pins) {
2831 MDSCacheObjectInfo info;
2832 p->set_object_info(info);
2833 reply->get_authpins().push_back(info);
2834 if (p == (MDSCacheObject*)auth_pin_freeze)
2835 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2836 }
2837
2838 if (wouldblock)
2839 reply->mark_error_wouldblock();
2840 if (readonly)
2841 reply->mark_error_rofs();
2842
2843 mds->send_message_mds(reply, mdr->slave_to_mds);
2844
2845 // clean up this request
2846 mdr->reset_slave_request();
2847 return;
2848 }
2849
2850 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
2851 {
2852 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2853 mds_rank_t from = mds_rank_t(ack->get_source().num());
2854
2855 // added auth pins?
2856 set<MDSCacheObject*> pinned;
2857 for (const auto &oi : ack->get_authpins()) {
2858 MDSCacheObject *object = mdcache->get_object(oi);
2859 ceph_assert(object); // we pinned it
2860 dout(10) << " remote has pinned " << *object << dendl;
2861 if (!mdr->is_auth_pinned(object))
2862 mdr->remote_auth_pins[object] = from;
2863 if (oi == ack->get_authpin_freeze())
2864 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2865 pinned.insert(object);
2866 }
2867
2868 // removed frozen auth pin ?
2869 if (mdr->more()->is_remote_frozen_authpin &&
2870 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2871 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2872 ceph_assert(p != mdr->remote_auth_pins.end());
2873 if (p->second == from) {
2874 mdr->more()->is_remote_frozen_authpin = false;
2875 }
2876 }
2877
2878 // removed auth pins?
2879 auto p = mdr->remote_auth_pins.begin();
2880 while (p != mdr->remote_auth_pins.end()) {
2881 MDSCacheObject* object = p->first;
2882 if (p->second == from && pinned.count(object) == 0) {
2883 dout(10) << " remote has unpinned " << *object << dendl;
2884 mdr->remote_auth_pins.erase(p++);
2885 } else {
2886 ++p;
2887 }
2888 }
2889
2890 if (ack->is_error_rofs()) {
2891 mdr->more()->slave_error = -EROFS;
2892 mdr->aborted = true;
2893 } else if (ack->is_error_wouldblock()) {
2894 mdr->more()->slave_error = -EWOULDBLOCK;
2895 mdr->aborted = true;
2896 }
2897
2898 // note slave
2899 mdr->more()->slaves.insert(from);
2900
2901 // clear from waiting list
2902 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2903 mdr->more()->waiting_on_slave.erase(from);
2904
2905 // go again?
2906 if (mdr->more()->waiting_on_slave.empty())
2907 mdcache->dispatch_request(mdr);
2908 else
2909 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2910 }
2911
2912
2913 // ---------------------------------------
2914 // HELPERS
2915
2916
2917 /**
2918 * check whether we are permitted to complete a request
2919 *
2920 * Check whether we have permission to perform the operation specified
2921 * by mask on the given inode, based on the capability in the mdr's
2922 * session.
2923 */
2924 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2925 {
2926 if (mdr->session) {
2927 int r = mdr->session->check_access(
2928 in, mask,
2929 mdr->client_request->get_caller_uid(),
2930 mdr->client_request->get_caller_gid(),
2931 &mdr->client_request->get_caller_gid_list(),
2932 mdr->client_request->head.args.setattr.uid,
2933 mdr->client_request->head.args.setattr.gid);
2934 if (r < 0) {
2935 respond_to_request(mdr, r);
2936 return false;
2937 }
2938 }
2939 return true;
2940 }
2941
2942 /**
2943 * check whether fragment has reached maximum size
2944 *
2945 */
2946 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2947 {
2948 const auto size = in->get_frag_size();
2949 if (size >= g_conf()->mds_bal_fragment_size_max) {
2950 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2951 respond_to_request(mdr, -ENOSPC);
2952 return false;
2953 }
2954
2955 return true;
2956 }
2957
2958
2959 /** validate_dentry_dir
2960 *
2961 * verify that the dir exists and would own the dname.
2962 * do not check if the dentry exists.
2963 */
2964 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname)
2965 {
2966 // make sure parent is a dir?
2967 if (!diri->is_dir()) {
2968 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2969 respond_to_request(mdr, -ENOTDIR);
2970 return NULL;
2971 }
2972
2973 // which dirfrag?
2974 frag_t fg = diri->pick_dirfrag(dname);
2975 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2976 if (!dir)
2977 return 0;
2978
2979 // frozen?
2980 if (dir->is_frozen()) {
2981 dout(7) << "dir is frozen " << *dir << dendl;
2982 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2983 return NULL;
2984 }
2985
2986 return dir;
2987 }
2988
2989
2990 /** prepare_null_dentry
2991 * prepare a null (or existing) dentry in given dir.
2992 * wait for any dn lock.
2993 */
2994 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist)
2995 {
2996 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2997 ceph_assert(dir->is_auth());
2998
2999 client_t client = mdr->get_client();
3000
3001 // does it already exist?
3002 CDentry *dn = dir->lookup(dname);
3003 if (dn) {
3004 /*
3005 if (dn->lock.is_xlocked_by_other(mdr)) {
3006 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
3007 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
3008 return 0;
3009 }
3010 */
3011 if (!dn->get_linkage(client, mdr)->is_null()) {
3012 // name already exists
3013 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
3014 if (!okexist) {
3015 respond_to_request(mdr, -EEXIST);
3016 return 0;
3017 }
3018 } else {
3019 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3020 dn->first = std::max(dn->first, next_snap);
3021 }
3022 return dn;
3023 }
3024
3025 // make sure dir is complete
3026 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
3027 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
3028 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
3029 return 0;
3030 }
3031
3032 // create
3033 dn = dir->add_null_dentry(dname, mdcache->get_global_snaprealm()->get_newest_seq() + 1);
3034 dn->mark_new();
3035 dout(10) << "prepare_null_dentry added " << *dn << dendl;
3036 return dn;
3037 }
3038
3039 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3040 {
3041 CDentry *straydn = mdr->straydn;
3042 if (straydn) {
3043 string straydname;
3044 in->name_stray_dentry(straydname);
3045 if (straydn->get_name() == straydname)
3046 return straydn;
3047
3048 ceph_assert(!mdr->done_locking);
3049 mdr->unpin(straydn);
3050 }
3051
3052 CDir *straydir = mdcache->get_stray_dir(in);
3053
3054 if (!mdr->client_request->is_replay() &&
3055 !check_fragment_space(mdr, straydir))
3056 return NULL;
3057
3058 straydn = mdcache->get_or_create_stray_dentry(in);
3059 mdr->straydn = straydn;
3060 mdr->pin(straydn);
3061 return straydn;
3062 }
3063
3064 /** prepare_new_inode
3065 *
3066 * create a new inode. set c/m/atime. hit dir pop.
3067 */
3068 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3069 file_layout_t *layout)
3070 {
3071 CInode *in = new CInode(mdcache);
3072
3073 // Server::prepare_force_open_sessions() can re-open a session in the closing
3074 // state. In that corner case, the session's prealloc_inos are being freed.
3075 // To simplify the code, we disallow using/refilling the session's prealloc_inos
3076 // while the session is opening.
3077 bool allow_prealloc_inos = !mdr->session->is_opening();
3078
3079 // assign ino
3080 if (allow_prealloc_inos &&
3081 mdr->session->info.prealloc_inos.size()) {
3082 mdr->used_prealloc_ino =
3083 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
3084 mds->sessionmap.mark_projected(mdr->session);
3085
3086 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3087 << " (" << mdr->session->info.prealloc_inos
3088 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
3089 << dendl;
3090 } else {
3091 mdr->alloc_ino =
3092 in->inode.ino = mds->inotable->project_alloc_id();
3093 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3094 }
3095
3096 if (useino && useino != in->inode.ino) {
3097 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
3098 mds->clog->error() << mdr->client_request->get_source()
3099 << " specified ino " << useino
3100 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
3101 //ceph_abort(); // just for now.
3102 }
3103
3104 if (allow_prealloc_inos &&
3105 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3106 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3107 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3108 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3109 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3110 mds->sessionmap.mark_projected(mdr->session);
3111 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3112 }
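// Worked example of the refill above (illustrative value): with
// mds_client_prealloc_inos=1000 and 400 inos still projected for this session,
// 400 < 1000/2 triggers a refill, projecting need = 1000-400 = 600 new inos
// from the inotable.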
3113
3114 in->inode.version = 1;
3115 in->inode.xattr_version = 1;
3116 in->inode.nlink = 1; // FIXME
3117
3118 in->inode.mode = mode;
3119
3120 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
3121 if (in->inode.is_dir()) {
3122 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3123 } else if (layout) {
3124 in->inode.layout = *layout;
3125 } else {
3126 in->inode.layout = mdcache->default_file_layout;
3127 }
3128
3129 in->inode.truncate_size = -1ull; // not truncated, yet!
3130 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3131
3132 CInode *diri = dir->get_inode();
3133
3134 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
3135
3136 if (diri->inode.mode & S_ISGID) {
3137 dout(10) << " dir is setgid" << dendl;
3138 in->inode.gid = diri->inode.gid;
3139 if (S_ISDIR(mode)) {
3140 dout(10) << " new dir also setgid" << dendl;
3141 in->inode.mode |= S_ISGID;
3142 }
3143 } else
3144 in->inode.gid = mdr->client_request->get_caller_gid();
3145
3146 in->inode.uid = mdr->client_request->get_caller_uid();
3147
3148 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
3149 mdr->get_op_stamp();
3150
3151 in->inode.change_attr = 0;
3152
3153 const MClientRequest::const_ref &req = mdr->client_request;
3154 if (req->get_data().length()) {
3155 auto p = req->get_data().cbegin();
3156
3157 // xattrs on new inode?
3158 CInode::mempool_xattr_map xattrs;
3159 decode(xattrs, p);
3160 for (const auto &p : xattrs) {
3161 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
3162 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
3163 if (!em.second)
3164 em.first->second = p.second;
3165 }
3166 }
3167
3168 if (!mds->mdsmap->get_inline_data_enabled() ||
3169 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3170 in->inode.inline_data.version = CEPH_INLINE_NONE;
3171
3172 mdcache->add_inode(in); // add
3173 dout(10) << "prepare_new_inode " << *in << dendl;
3174 return in;
3175 }
3176
3177 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3178 {
3179 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3180 << " inotablev " << mds->inotable->get_projected_version()
3181 << dendl;
3182 blob->set_ino_alloc(mdr->alloc_ino,
3183 mdr->used_prealloc_ino,
3184 mdr->prealloc_inos,
3185 mdr->client_request->get_source(),
3186 mds->sessionmap.get_projected(),
3187 mds->inotable->get_projected_version());
3188 }
3189
3190 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3191 {
3192 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3193 << " / " << mdr->prealloc_inos
3194 << " / " << mdr->used_prealloc_ino << dendl;
3195
3196 if (mdr->alloc_ino) {
3197 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3198 }
3199 if (mdr->prealloc_inos.size()) {
3200 ceph_assert(session);
3201 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3202 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3203 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3204 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3205 }
3206 if (mdr->used_prealloc_ino) {
3207 ceph_assert(session);
3208 session->info.used_inos.erase(mdr->used_prealloc_ino);
3209 mds->sessionmap.mark_dirty(session);
3210 }
3211 }
3212
3213 class C_MDS_TryFindInode : public ServerContext {
3214 MDRequestRef mdr;
3215 public:
3216 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3217 void finish(int r) override {
3218 if (r == -ESTALE) // :( find_ino_peers failed
3219 server->respond_to_request(mdr, r);
3220 else
3221 server->dispatch_client_request(mdr);
3222 }
3223 };
3224
3225 class CF_MDS_MDRContextFactory : public MDSContextFactory {
3226 public:
3227 CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr) : cache(cache), mdr(mdr) {}
3228 MDSContext *build() {
3229 return new C_MDS_RetryRequest(cache, mdr);
3230 }
3231 private:
3232 MDCache *cache;
3233 MDRequestRef mdr;
3234 };
3235
3236 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
3237 {
3238 // figure parent dir vs dname
3239 if (refpath.depth() == 0) {
3240 dout(7) << "can't do that to root" << dendl;
3241 respond_to_request(mdr, -EINVAL);
3242 return 0;
3243 }
3244 string dname = refpath.last_dentry();
3245 refpath.pop_dentry();
3246
3247 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
3248
3249 // traverse to parent dir
3250 CInode *diri;
3251 CF_MDS_MDRContextFactory cf(mdcache, mdr);
3252 int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
3253 if (r > 0) return 0; // delayed
3254 if (r < 0) {
3255 if (r == -ESTALE) {
3256 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3257 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3258 return 0;
3259 }
3260 respond_to_request(mdr, r);
3261 return 0;
3262 }
3263
3264 // is it an auth dir?
3265 CDir *dir = validate_dentry_dir(mdr, diri, dname);
3266 if (!dir)
3267 return 0; // forwarded or waiting for freeze
3268
3269 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
3270 return dir;
3271 }
3272
3273 /* If this returns null, the request has been handled
3274 * as appropriate: forwarded on, or the client's been replied to */
3275 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
3276 MutationImpl::LockOpVec& lov,
3277 bool want_auth,
3278 bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
3279 a snapped dir */
3280 file_layout_t **layout,
3281 bool no_lookup) // true if we cannot return a null dentry lease
3282 {
3283 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
3284 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3285
3286 if (mdr->done_locking)
3287 return mdr->in[n];
3288
3289 // traverse
3290 CF_MDS_MDRContextFactory cf(mdcache, mdr);
3291 int r = mdcache->path_traverse(mdr, cf, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
3292 if (r > 0)
3293 return NULL; // delayed
3294 if (r < 0) { // error
3295 if (r == -ENOENT && n == 0 && !mdr->dn[n].empty()) {
3296 if (!no_lookup) {
3297 mdr->tracedn = mdr->dn[n].back();
3298 }
3299 respond_to_request(mdr, r);
3300 } else if (r == -ESTALE) {
3301 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3302 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3303 mdcache->find_ino_peers(refpath.get_ino(), c);
3304 } else {
3305 dout(10) << "FAIL on error " << r << dendl;
3306 respond_to_request(mdr, r);
3307 }
3308 return 0;
3309 }
3310 CInode *ref = mdr->in[n];
3311 dout(10) << "ref is " << *ref << dendl;
3312
3313 // fw to inode auth?
3314 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
3315 want_auth = true;
3316
3317 if (want_auth) {
3318 if (ref->is_ambiguous_auth()) {
3319 dout(10) << "waiting for single auth on " << *ref << dendl;
3320 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
3321 return 0;
3322 }
3323 if (!ref->is_auth()) {
3324 dout(10) << "fw to auth for " << *ref << dendl;
3325 mdcache->request_forward(mdr, ref->authority().first);
3326 return 0;
3327 }
3328
3329 // auth_pin?
3330 // do NOT proceed if freezing, as cap release may defer in that case, and
3331 // we could deadlock when we try to lock @ref.
3332 // if we're already auth_pinned, continue; the release has already been processed.
3333 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3334 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3335 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3336 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3337 /* If we have any auth pins, this will deadlock.
3338 * But the only way to get here while already holding auth pins
3339 * is that we're on an inode with snapshots that got updated
3340 * between dispatches of this request. So we're going to drop
3341 * our locks and our auth pins and reacquire them later.
3342 *
3343 * This is safe since we're only in this function when working on
3344 * a single MDS request; otherwise we'd be in
3345 * rdlock_path_xlock_dentry.
3346 */
3347 mds->locker->drop_locks(mdr.get(), NULL);
3348 mdr->drop_local_auth_pins();
3349 if (!mdr->remote_auth_pins.empty())
3350 mds->locker->notify_freeze_waiter(ref);
3351 return 0;
3352 }
3353
3354 mdr->auth_pin(ref);
3355 }
3356
3357 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3358 lov.add_rdlock(&mdr->dn[n][i]->lock);
3359 if (layout)
3360 mds->locker->include_snap_rdlocks_wlayout(ref, lov, layout);
3361 else
3362 mds->locker->include_snap_rdlocks(ref, lov);
3363
3364 // set and pin ref
3365 mdr->pin(ref);
3366 return ref;
3367 }
3368
3369
3370 /** rdlock_path_xlock_dentry
3371 * traverse path to the directory that could/would contain dentry.
3372 * make sure i am auth for that dentry, forward as necessary.
3373 * create null dentry in place (or use existing if okexist).
3374 * get rdlocks on traversed dentries, xlock on new dentry.
3375 */
3376 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
3377 MutationImpl::LockOpVec& lov,
3378 bool okexist, bool mustexist, bool alwaysxlock,
3379 file_layout_t **layout)
3380 {
3381 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
3382
3383 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3384
3385 client_t client = mdr->get_client();
3386
3387 if (mdr->done_locking)
3388 return mdr->dn[n].back();
3389
3390 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
3391 if (!dir) return 0;
3392
3393 CInode *diri = dir->get_inode();
3394 if (!mdr->reqid.name.is_mds()) {
3395 if (diri->is_system() && !diri->is_root()) {
3396 respond_to_request(mdr, -EROFS);
3397 return 0;
3398 }
3399 }
3400 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3401 respond_to_request(mdr, -ENOENT);
3402 return 0;
3403 }
3404
3405 // make a null dentry?
3406 std::string_view dname = refpath.last_dentry();
3407 CDentry *dn;
3408 if (mustexist) {
3409 dn = dir->lookup(dname);
3410
3411 // make sure dir is complete
3412 if (!dn && !dir->is_complete() &&
3413 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
3414 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
3415 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
3416 return 0;
3417 }
3418
3419 // readable?
3420 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
3421 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
3422 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
3423 return 0;
3424 }
3425
3426 // exists?
3427 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
3428 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
3429 respond_to_request(mdr, -ENOENT);
3430 return 0;
3431 }
3432 } else {
3433 dn = prepare_null_dentry(mdr, dir, dname, okexist);
3434 if (!dn)
3435 return 0;
3436 }
3437
3438 mdr->dn[n].push_back(dn);
3439 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
3440 mdr->in[n] = dnl->get_inode();
3441
3442 // -- lock --
3443 // NOTE: rename takes the same set of locks for srcdn
3444 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3445 lov.add_rdlock(&mdr->dn[n][i]->lock);
3446 if (alwaysxlock || dnl->is_null())
3447 lov.add_xlock(&dn->lock); // new dn, xlock
3448 else
3449 lov.add_rdlock(&dn->lock); // existing dn, rdlock
3450 lov.add_wrlock(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
3451 lov.add_wrlock(&dn->get_dir()->inode->nestlock); // also, wrlock on dir nested (rstat) accounting
3452 if (layout)
3453 mds->locker->include_snap_rdlocks_wlayout(dn->get_dir()->inode, lov, layout);
3454 else
3455 mds->locker->include_snap_rdlocks(dn->get_dir()->inode, lov);
3456
3457 return dn;
3458 }
3459
3460
3461
3462
3463
3464 /**
3465 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3466 *
3467 * @param diri base inode
3468 * @param fg the exact frag we want
3469 * @param mdr request
3470 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3471 */
3472 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3473 {
3474 CDir *dir = diri->get_dirfrag(fg);
3475
3476 // not open and inode not mine?
3477 if (!dir && !diri->is_auth()) {
3478 mds_rank_t inauth = diri->authority().first;
3479 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3480 mdcache->request_forward(mdr, inauth);
3481 return 0;
3482 }
3483
3484 // not open and inode frozen?
3485 if (!dir && diri->is_frozen()) {
3486 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3487 ceph_assert(diri->get_parent_dir());
3488 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3489 return 0;
3490 }
3491
3492 // invent?
3493 if (!dir)
3494 dir = diri->get_or_open_dirfrag(mdcache, fg);
3495
3496 // am i auth for the dirfrag?
3497 if (!dir->is_auth()) {
3498 mds_rank_t auth = dir->authority().first;
3499 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3500 << ", fw to mds." << auth << dendl;
3501 mdcache->request_forward(mdr, auth);
3502 return 0;
3503 }
3504
3505 return dir;
3506 }
3507
3508
3509 // ===============================================================================
3510 // STAT
3511
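/* Common handler for getattr and lookup.  The lock set is trimmed to the
 * fields named in the client's mask, and fields the client already holds
 * EXCL caps on are not rdlocked at all (the client's own copy is
 * authoritative in that case). */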
3512 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3513 {
3514 const MClientRequest::const_ref &req = mdr->client_request;
3515
3516 if (req->get_filepath().depth() == 0 && is_lookup) {
3517 // refpath can't be empty for lookup but it can for
3518 // getattr (we do getattr with empty refpath for mount of '/')
3519 respond_to_request(mdr, -EINVAL);
3520 return;
3521 }
3522
3523 bool want_auth = false;
3524 int mask = req->head.args.getattr.mask;
3525 if (mask & CEPH_STAT_RSTAT)
3526 want_auth = true; // rstat is only up to date on the auth MDS
3527
3528 MutationImpl::LockOpVec lov;
3529 CInode *ref = rdlock_path_pin_ref(mdr, 0, lov, want_auth, false, NULL,
3530 !is_lookup);
3531 if (!ref) return;
3532
3533 /*
3534 * if client currently holds the EXCL cap on a field, do not rdlock
3535 * it; client's stat() will result in valid info if _either_ EXCL
3536 * cap is held or MDS rdlocks and reads the value here.
3537 *
3538 * handling this case here is easier than weakening rdlock
3539 * semantics... that would cause problems elsewhere.
3540 */
3541 client_t client = mdr->get_client();
3542 int issued = 0;
3543 Capability *cap = ref->get_client_cap(client);
3544 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3545 mdr->snapid <= cap->client_follows))
3546 issued = cap->issued();
3547
3548 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3549 lov.add_rdlock(&ref->linklock);
3550 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3551 lov.add_rdlock(&ref->authlock);
3552 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3553 lov.add_rdlock(&ref->xattrlock);
3554 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3555 // Don't wait on unstable filelock if client is allowed to read file size.
3556 // This can reduce the response time of getattr in the case that multiple
3557 // clients do stat(2) and there are writers.
3558 // The downside of this optimization is that mds may not issue Fs caps along
3559 // with getattr reply. Client may need to send more getattr requests.
3560 if (mdr->is_rdlocked(&ref->filelock)) {
3561 lov.add_rdlock(&ref->filelock);
3562 } else if (ref->filelock.is_stable() ||
3563 ref->filelock.get_num_wrlocks() > 0 ||
3564 !ref->filelock.can_read(mdr->get_client())) {
3565 lov.add_rdlock(&ref->filelock);
3566 mdr->done_locking = false;
3567 }
3568 }
3569
3570 if (!mds->locker->acquire_locks(mdr, lov))
3571 return;
3572
3573 if (!check_access(mdr, ref, MAY_READ))
3574 return;
3575
3576 utime_t now = ceph_clock_now();
3577 mdr->set_mds_stamp(now);
3578
3579 // note which caps are requested, so we return at least a snapshot
3580 // value for them. (currently this matters for xattrs and inline data)
3581 mdr->getattr_caps = mask;
3582
3583 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3584
3585 // reply
3586 dout(10) << "reply to stat on " << *req << dendl;
3587 mdr->tracei = ref;
3588 if (is_lookup)
3589 mdr->tracedn = mdr->dn[0].back();
3590 respond_to_request(mdr, 0);
3591 }
3592
3593 struct C_MDS_LookupIno2 : public ServerContext {
3594 MDRequestRef mdr;
3595 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3596 void finish(int r) override {
3597 server->_lookup_ino_2(mdr, r);
3598 }
3599 };
3600
3601 /*
3602 * filepath: ino
3603 */
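/* Handles lookup_ino, lookup_parent (want_parent) and lookup_name
 * (want_dentry).  If the inode isn't cached we open it by ino first and
 * retry via C_MDS_LookupIno2. */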
3604 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3605 bool want_parent, bool want_dentry)
3606 {
3607 const MClientRequest::const_ref &req = mdr->client_request;
3608
3609 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3610 return _lookup_snap_ino(mdr);
3611
3612 inodeno_t ino = req->get_filepath().get_ino();
3613 CInode *in = mdcache->get_inode(ino);
3614 if (in && in->state_test(CInode::STATE_PURGING)) {
3615 respond_to_request(mdr, -ESTALE);
3616 return;
3617 }
3618 if (!in) {
3619 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3620 return;
3621 }
3622
3623 if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
3624 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3625 return;
3626 }
3627
3628 // check for nothing (not read or write); this still applies the
3629 // path check.
3630 if (!check_access(mdr, in, 0))
3631 return;
3632
3633 CDentry *dn = in->get_projected_parent_dn();
3634 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3635
3636 MutationImpl::LockOpVec lov;
3637 if (dn && (want_parent || want_dentry)) {
3638 mdr->pin(dn);
3639 lov.add_rdlock(&dn->lock);
3640 }
3641
3642 unsigned mask = req->head.args.lookupino.mask;
3643 if (mask) {
3644 Capability *cap = in->get_client_cap(mdr->get_client());
3645 int issued = 0;
3646 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3647 issued = cap->issued();
3648 // permission bits, ACL/security xattrs
3649 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3650 lov.add_rdlock(&in->authlock);
3651 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3652 lov.add_rdlock(&in->xattrlock);
3653
3654 mdr->getattr_caps = mask;
3655 }
3656
3657 if (!lov.empty()) {
3658 if (!mds->locker->acquire_locks(mdr, lov))
3659 return;
3660
3661 if (diri != NULL) {
3662 // need read access to directory inode
3663 if (!check_access(mdr, diri, MAY_READ))
3664 return;
3665 }
3666 }
3667
3668 if (want_parent) {
3669 if (in->is_base()) {
3670 respond_to_request(mdr, -EINVAL);
3671 return;
3672 }
3673 if (!diri || diri->is_stray()) {
3674 respond_to_request(mdr, -ESTALE);
3675 return;
3676 }
3677 dout(10) << "reply to lookup_parent " << *in << dendl;
3678 mdr->tracei = diri;
3679 respond_to_request(mdr, 0);
3680 } else {
3681 if (want_dentry) {
3682 inodeno_t dirino = req->get_filepath2().get_ino();
3683 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3684 respond_to_request(mdr, -ENOENT);
3685 return;
3686 }
3687 dout(10) << "reply to lookup_name " << *in << dendl;
3688 } else
3689 dout(10) << "reply to lookup_ino " << *in << dendl;
3690
3691 mdr->tracei = in;
3692 if (want_dentry)
3693 mdr->tracedn = dn;
3694 respond_to_request(mdr, 0);
3695 }
3696 }
3697
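// Snapshot flavour of lookup-by-ino: locate the snapped inode for
// (ino, snapid), falling back to opening the parent directory and the
// dirfrag selected by the dentry hash when the snap inode isn't in cache.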
3698 void Server::_lookup_snap_ino(MDRequestRef& mdr)
3699 {
3700 const MClientRequest::const_ref &req = mdr->client_request;
3701
3702 vinodeno_t vino;
3703 vino.ino = req->get_filepath().get_ino();
3704 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3705 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3706 __u32 hash = req->head.args.lookupino.hash;
3707
3708 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3709
3710 CInode *in = mdcache->lookup_snap_inode(vino);
3711 if (!in) {
3712 in = mdcache->get_inode(vino.ino);
3713 if (in) {
3714 if (in->state_test(CInode::STATE_PURGING) ||
3715 !in->has_snap_data(vino.snapid)) {
3716 if (in->is_dir() || !parent_ino) {
3717 respond_to_request(mdr, -ESTALE);
3718 return;
3719 }
3720 in = NULL;
3721 }
3722 }
3723 }
3724
3725 if (in) {
3726 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
3727 mdr->snapid = vino.snapid;
3728 mdr->tracei = in;
3729 respond_to_request(mdr, 0);
3730 return;
3731 }
3732
3733 CInode *diri = NULL;
3734 if (parent_ino) {
3735 diri = mdcache->get_inode(parent_ino);
3736 if (!diri) {
3737 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
3738 return;
3739 }
3740
3741 if (!diri->is_dir()) {
3742 respond_to_request(mdr, -EINVAL);
3743 return;
3744 }
3745
3746 MutationImpl::LockOpVec lov;
3747 lov.add_rdlock(&diri->dirfragtreelock);
3748 if (!mds->locker->acquire_locks(mdr, lov))
3749 return;
3750
3751 frag_t frag = diri->dirfragtree[hash];
3752 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
3753 if (!dir)
3754 return;
3755
3756 if (!dir->is_complete()) {
3757 if (dir->is_frozen()) {
3758 mds->locker->drop_locks(mdr.get());
3759 mdr->drop_local_auth_pins();
3760 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3761 return;
3762 }
3763 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3764 return;
3765 }
3766
3767 respond_to_request(mdr, -ESTALE);
3768 } else {
3769 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
3770 }
3771 }
3772
3773 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3774 {
3775 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3776 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3777
3778 // `r` is a rank if >=0, else an error code
3779 if (r >= 0) {
3780 mds_rank_t dest_rank(r);
3781 if (dest_rank == mds->get_nodeid())
3782 dispatch_client_request(mdr);
3783 else
3784 mdcache->request_forward(mdr, dest_rank);
3785 return;
3786 }
3787
3788 // give up
3789 if (r == -ENOENT || r == -ENODATA)
3790 r = -ESTALE;
3791 respond_to_request(mdr, r);
3792 }
3793
3794
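// Open an existing inode.  We only need to be auth when the open can
// mutate state: any writeable mode, O_TRUNC, or O_DIRECTORY; plain
// read-only opens can be served from a replica.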
3795 /* This function takes responsibility for the passed mdr*/
3796 void Server::handle_client_open(MDRequestRef& mdr)
3797 {
3798 const MClientRequest::const_ref &req = mdr->client_request;
3799 dout(7) << "open on " << req->get_filepath() << dendl;
3800
3801 int flags = req->head.args.open.flags;
3802 int cmode = ceph_flags_to_mode(flags);
3803 if (cmode < 0) {
3804 respond_to_request(mdr, -EINVAL);
3805 return;
3806 }
3807
3808 bool need_auth = !file_mode_is_readonly(cmode) ||
3809 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3810
3811 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3812 dout(7) << "read-only FS" << dendl;
3813 respond_to_request(mdr, -EROFS);
3814 return;
3815 }
3816
3817 MutationImpl::LockOpVec lov;
3818 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, need_auth);
3819 if (!cur)
3820 return;
3821
3822 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3823 ceph_assert(!need_auth);
3824 mdr->done_locking = false;
3825 cur = rdlock_path_pin_ref(mdr, 0, lov, true); // re-traverse with want_auth
3826 if (!cur)
3827 return;
3828 }
3829
3830 if (!cur->inode.is_file()) {
3831 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3832 cmode = CEPH_FILE_MODE_PIN;
3833 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
3834 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3835 flags &= ~CEPH_O_TRUNC;
3836 }
3837
3838 dout(10) << "open flags = " << flags
3839 << ", filemode = " << cmode
3840 << ", need_auth = " << need_auth
3841 << dendl;
3842
3843 // regular file?
3844 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3845 dout(7) << "not a file or dir " << *cur << dendl;
3846 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3847 return;
3848 }*/
3849 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3850 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3851 respond_to_request(mdr, -EINVAL);
3852 return;
3853 }
3854
3855 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3856 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3857 // return -EISDIR for a directory, -EINVAL for other non-regular inodes
3858 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3859 return;
3860 }
3861
3862 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3863 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3864 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3865 respond_to_request(mdr, -EPERM);
3866 return;
3867 }
3868
3869 // snapped data is read only
3870 if (mdr->snapid != CEPH_NOSNAP &&
3871 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3872 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3873 respond_to_request(mdr, -EROFS);
3874 return;
3875 }
3876
3877 unsigned mask = req->head.args.open.mask;
3878 if (mask) {
3879 Capability *cap = cur->get_client_cap(mdr->get_client());
3880 int issued = 0;
3881 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3882 issued = cap->issued();
3883 // permission bits, ACL/security xattrs
3884 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3885 lov.add_rdlock(&cur->authlock);
3886 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3887 lov.add_rdlock(&cur->xattrlock);
3888
3889 mdr->getattr_caps = mask;
3890 }
3891
3892 // O_TRUNC
3893 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3894 ceph_assert(cur->is_auth());
3895
3896 lov.add_xlock(&cur->filelock);
3897 if (!mds->locker->acquire_locks(mdr, lov))
3898 return;
3899
3900 if (!check_access(mdr, cur, MAY_WRITE))
3901 return;
3902
3903 // wait for pending truncate?
3904 const auto pi = cur->get_projected_inode();
3905 if (pi->is_truncating()) {
3906 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3907 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3908 mds->locker->drop_locks(mdr.get());
3909 mdr->drop_local_auth_pins();
3910 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3911 return;
3912 }
3913
3914 do_open_truncate(mdr, cmode);
3915 return;
3916 }
3917
3918 // sync filelock if snapped.
3919 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3920 // and that data itself is flushed so that we can read the snapped data off disk.
3921 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3922 lov.add_rdlock(&cur->filelock);
3923 }
3924
3925 if (!mds->locker->acquire_locks(mdr, lov))
3926 return;
3927
3928 mask = MAY_READ;
3929 if (cmode & CEPH_FILE_MODE_WR)
3930 mask |= MAY_WRITE;
3931 if (!check_access(mdr, cur, mask))
3932 return;
3933
3934 utime_t now = ceph_clock_now();
3935 mdr->set_mds_stamp(now);
3936
3937 if (cur->is_file() || cur->is_dir()) {
3938 if (mdr->snapid == CEPH_NOSNAP) {
3939 // register new cap
3940 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3941 if (cap)
3942 dout(12) << "open issued caps " << ccap_string(cap->pending())
3943 << " for " << req->get_source()
3944 << " on " << *cur << dendl;
3945 } else {
3946 int caps = ceph_caps_for_mode(cmode);
3947 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3948 << " for " << req->get_source()
3949 << " snapid " << mdr->snapid
3950 << " on " << *cur << dendl;
3951 mdr->snap_caps = caps;
3952 }
3953 }
3954
3955 // increase max_size?
3956 if (cmode & CEPH_FILE_MODE_WR)
3957 mds->locker->check_inode_max_size(cur);
3958
3959 // make sure this inode gets into the journal
3960 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3961 mdcache->open_file_table.should_log_open(cur)) {
3962 EOpen *le = new EOpen(mds->mdlog);
3963 mdlog->start_entry(le);
3964 le->add_clean_inode(cur);
3965 mdlog->submit_entry(le);
3966 }
3967
3968 // hit pop
3969 if (cmode & CEPH_FILE_MODE_WR)
3970 mds->balancer->hit_inode(cur, META_POP_IWR);
3971 else
3972 mds->balancer->hit_inode(cur, META_POP_IRD,
3973 mdr->client_request->get_source().num());
3974
3975 CDentry *dn = 0;
3976 if (req->get_dentry_wanted()) {
3977 ceph_assert(mdr->dn[0].size());
3978 dn = mdr->dn[0].back();
3979 }
3980
3981 mdr->tracei = cur;
3982 mdr->tracedn = dn;
3983 respond_to_request(mdr, 0);
3984 }
3985
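// Journal-commit finisher for openc: make the new dentry/inode visible,
// mark them dirty against the new log segment, and reply to the client.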
3986 class C_MDS_openc_finish : public ServerLogContext {
3987 CDentry *dn;
3988 CInode *newi;
3989 public:
3990 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
3991 ServerLogContext(s, r), dn(d), newi(ni) {}
3992 void finish(int r) override {
3993 ceph_assert(r == 0);
3994
3995 dn->pop_projected_linkage();
3996
3997 // dirty inode, dn, dir
3998 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3999 newi->mark_dirty(newi->inode.version+1, mdr->ls);
4000 newi->mark_dirty_parent(mdr->ls, true);
4001
4002 mdr->apply();
4003
4004 get_mds()->locker->share_inode_max_size(newi);
4005
4006 MDRequestRef null_ref;
4007 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4008
4009 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4010
4011 server->respond_to_request(mdr, 0);
4012
4013 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4014 }
4015 };
4016
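// O_CREAT path: unless O_EXCL is set we first try a plain traverse, and
// if the target already exists we fall back to handle_client_open().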
4017 /* This function takes responsibility for the passed mdr*/
4018 void Server::handle_client_openc(MDRequestRef& mdr)
4019 {
4020 const MClientRequest::const_ref &req = mdr->client_request;
4021 client_t client = mdr->get_client();
4022
4023 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4024
4025 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4026 if (cmode < 0) {
4027 respond_to_request(mdr, -EINVAL);
4028 return;
4029 }
4030
4031 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4032
4033 if (!excl) {
4034 CF_MDS_MDRContextFactory cf(mdcache, mdr);
4035 int r = mdcache->path_traverse(mdr, cf, req->get_filepath(),
4036 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
4037 if (r > 0) return;
4038 if (r == 0) {
4039 // it existed.
4040 handle_client_open(mdr);
4041 return;
4042 }
4043 if (r < 0 && r != -ENOENT) {
4044 if (r == -ESTALE) {
4045 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
4046 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
4047 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
4048 } else {
4049 dout(10) << "FAIL on error " << r << dendl;
4050 respond_to_request(mdr, r);
4051 }
4052 return;
4053 }
4054 }
4055
4056 MutationImpl::LockOpVec lov;
4057 file_layout_t *dir_layout = nullptr;
4058 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov,
4059 !excl, false, false, &dir_layout);
4060 if (!dn) return;
4061 if (mdr->snapid != CEPH_NOSNAP) {
4062 respond_to_request(mdr, -EROFS);
4063 return;
4064 }
4065 // set layout
4066 file_layout_t layout;
4067 if (dir_layout)
4068 layout = *dir_layout;
4069 else
4070 layout = mdcache->default_file_layout;
4071
4072 // What kind of client caps are required to complete this operation
4073 uint64_t access = MAY_WRITE;
4074
4075 const auto default_layout = layout;
4076
4077 // fill in any special params from client
4078 if (req->head.args.open.stripe_unit)
4079 layout.stripe_unit = req->head.args.open.stripe_unit;
4080 if (req->head.args.open.stripe_count)
4081 layout.stripe_count = req->head.args.open.stripe_count;
4082 if (req->head.args.open.object_size)
4083 layout.object_size = req->head.args.open.object_size;
4084 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4085 (__s32)req->head.args.open.pool >= 0) {
4086 layout.pool_id = req->head.args.open.pool;
4087
4088 // make sure we have as new a map as the client
4089 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4090 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4091 return;
4092 }
4093 }
4094
4095 // If client doesn't have capability to modify layout pools, then
4096 // only permit this request if the requested pool matches what the
4097 // file would have inherited anyway from its parent.
4098 if (default_layout != layout) {
4099 access |= MAY_SET_VXATTR;
4100 }
4101
4102 if (!layout.is_valid()) {
4103 dout(10) << " invalid initial file layout" << dendl;
4104 respond_to_request(mdr, -EINVAL);
4105 return;
4106 }
4107 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4108 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4109 respond_to_request(mdr, -EINVAL);
4110 return;
4111 }
4112
4113 // created null dn.
4114 CDir *dir = dn->get_dir();
4115 CInode *diri = dir->get_inode();
4116 lov.add_rdlock(&diri->authlock);
4117 if (!mds->locker->acquire_locks(mdr, lov))
4118 return;
4119
4120 if (!check_access(mdr, diri, access))
4121 return;
4122
4123 if (!check_fragment_space(mdr, dir))
4124 return;
4125
4126 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4127
4128 if (!dnl->is_null()) {
4129 // it existed.
4130 ceph_assert(req->head.args.open.flags & CEPH_O_EXCL);
4131 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
4132 mdr->tracei = dnl->get_inode();
4133 mdr->tracedn = dn;
4134 respond_to_request(mdr, -EEXIST);
4135 return;
4136 }
4137
4138 // create inode.
4139 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4140 req->head.args.open.mode | S_IFREG, &layout);
4141 ceph_assert(in);
4142
4143 // it's a file.
4144 dn->push_projected_linkage(in);
4145
4146 in->inode.version = dn->pre_dirty();
4147 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4148 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
4149 in->inode.update_backtrace();
4150 in->inode.rstat.rfiles = 1;
4151
4152 SnapRealm *realm = diri->find_snaprealm();
4153 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4154 ceph_assert(follows >= realm->get_newest_seq());
4155
4156 ceph_assert(dn->first == follows+1);
4157 in->first = dn->first;
4158
4159 // do the open
4160 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
4161 in->authlock.set_state(LOCK_EXCL);
4162 in->xattrlock.set_state(LOCK_EXCL);
4163
4164 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4165 in->inode.client_ranges[client].range.first = 0;
4166 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
4167 in->inode.client_ranges[client].follows = follows;
4168 cap->mark_clientwriteable();
4169 }
4170
4171 // prepare finisher
4172 mdr->ls = mdlog->get_current_segment();
4173 EUpdate *le = new EUpdate(mdlog, "openc");
4174 mdlog->start_entry(le);
4175 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4176 journal_allocated_inos(mdr, &le->metablob);
4177 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4178 le->metablob.add_primary_dentry(dn, in, true, true, true);
4179
4180 // make sure this inode gets into the journal
4181 le->metablob.add_opened_ino(in->ino());
4182
4183 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
4184
4185 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4186 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4187 // add the created ino to the reply if the client supports the create-inode reply feature
4188 encode(in->inode.ino, mdr->reply_extra_bl);
4189 }
4190
4191 journal_and_reply(mdr, in, dn, le, fin);
4192
4193 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4194 // have overshot the split size (multiple opencs in flight), so here is
4195 // an early chance to split the dir if this openc makes it oversized.
4196 mds->balancer->maybe_fragment(dir, false);
4197 }
4198
4199
4200
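/* readdir: encode a DirStat for the frag plus as many (dentry, lease,
 * inode) triples as fit in max_bytes, starting after the (offset_str,
 * offset_hash) position supplied by the client. */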
4201 void Server::handle_client_readdir(MDRequestRef& mdr)
4202 {
4203 const MClientRequest::const_ref &req = mdr->client_request;
4204 client_t client = req->get_source().num();
4205 MutationImpl::LockOpVec lov;
4206 CInode *diri = rdlock_path_pin_ref(mdr, 0, lov, false, true);
4207 if (!diri) return;
4208
4209 // it's a directory, right?
4210 if (!diri->is_dir()) {
4211 // not a dir
4212 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
4213 respond_to_request(mdr, -ENOTDIR);
4214 return;
4215 }
4216
4217 lov.add_rdlock(&diri->filelock);
4218 lov.add_rdlock(&diri->dirfragtreelock);
4219
4220 if (!mds->locker->acquire_locks(mdr, lov))
4221 return;
4222
4223 if (!check_access(mdr, diri, MAY_READ))
4224 return;
4225
4226 // which frag?
4227 frag_t fg = (__u32)req->head.args.readdir.frag;
4228 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4229 string offset_str = req->get_path2();
4230
4231 __u32 offset_hash = 0;
4232 if (!offset_str.empty())
4233 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4234 else
4235 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4236
4237 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4238 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4239
4240 // does the frag exist?
4241 if (diri->dirfragtree[fg.value()] != fg) {
4242 frag_t newfg;
4243 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4244 if (fg.contains((unsigned)offset_hash)) {
4245 newfg = diri->dirfragtree[offset_hash];
4246 } else {
4247 // client actually wants next frag
4248 newfg = diri->dirfragtree[fg.value()];
4249 }
4250 } else {
4251 offset_str.clear();
4252 newfg = diri->dirfragtree[fg.value()];
4253 }
4254 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4255 fg = newfg;
4256 }
4257
4258 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4259 if (!dir) return;
4260
4261 // ok!
4262 dout(10) << "handle_client_readdir on " << *dir << dendl;
4263 ceph_assert(dir->is_auth());
4264
4265 if (!dir->is_complete()) {
4266 if (dir->is_frozen()) {
4267 dout(7) << "dir is frozen " << *dir << dendl;
4268 mds->locker->drop_locks(mdr.get());
4269 mdr->drop_local_auth_pins();
4270 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4271 return;
4272 }
4273 // fetch
4274 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4275 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4276 return;
4277 }
4278
4279 #ifdef MDS_VERIFY_FRAGSTAT
4280 dir->verify_fragstat();
4281 #endif
4282
4283 utime_t now = ceph_clock_now();
4284 mdr->set_mds_stamp(now);
4285
4286 snapid_t snapid = mdr->snapid;
4287 dout(10) << "snapid " << snapid << dendl;
4288
4289 SnapRealm *realm = diri->find_snaprealm();
4290
4291 unsigned max = req->head.args.readdir.max_entries;
4292 if (!max)
4293 max = dir->get_num_any(); // whatever, something big.
4294 unsigned max_bytes = req->head.args.readdir.max_bytes;
4295 if (!max_bytes)
4296 // make sure at least one item can be encoded
4297 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4298
4299 // start final blob
4300 bufferlist dirbl;
4301 DirStat ds;
4302 ds.frag = dir->get_frag();
4303 ds.auth = dir->get_dir_auth().first;
4304 if (dir->is_auth())
4305 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4306
4307 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4308
4309 // count bytes available.
4310 // this isn't perfect, but we should capture the main variable/unbounded size items!
4311 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4312 int bytes_left = max_bytes - front_bytes;
4313 bytes_left -= realm->get_snap_trace().length();
4314
4315 // build dir contents
4316 bufferlist dnbl;
4317 __u32 numfiles = 0;
4318 bool start = !offset_hash && offset_str.empty();
4319 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4320 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4321 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4322 bool end = (it == dir->end());
4323 for (; !end && numfiles < max; end = (it == dir->end())) {
4324 CDentry *dn = it->second;
4325 ++it;
4326
4327 if (dn->state_test(CDentry::STATE_PURGING))
4328 continue;
4329
4330 bool dnp = dn->use_projected(client, mdr);
4331 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4332
4333 if (dnl->is_null())
4334 continue;
4335
4336 if (dn->last < snapid || dn->first > snapid) {
4337 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4338 continue;
4339 }
4340
4341 if (!start) {
4342 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4343 if (!(offset_key < dn->key()))
4344 continue;
4345 }
4346
4347 CInode *in = dnl->get_inode();
4348
4349 if (in && in->ino() == CEPH_INO_CEPH)
4350 continue;
4351
4352 // remote link?
4353 // better for the MDS to do the work, if we think the client will stat any of these files.
4354 if (dnl->is_remote() && !in) {
4355 in = mdcache->get_inode(dnl->get_remote_ino());
4356 if (in) {
4357 dn->link_remote(dnl, in);
4358 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4359 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4360 continue;
4361 } else {
4362 // touch everything i _do_ have
4363 for (auto &p : *dir) {
4364 if (!p.second->get_linkage()->is_null())
4365 mdcache->lru.lru_touch(p.second);
4366 }
4367
4368 // already issued caps and leases, reply immediately.
4369 if (dnbl.length() > 0) {
4370 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4371 dout(10) << " open remote dentry after caps were issued, stopping at "
4372 << dnbl.length() << " < " << bytes_left << dendl;
4373 break;
4374 }
4375
4376 mds->locker->drop_locks(mdr.get());
4377 mdr->drop_local_auth_pins();
4378 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4379 return;
4380 }
4381 }
4382 ceph_assert(in);
4383
4384 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4385 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4386 break;
4387 }
4388
4389 unsigned start_len = dnbl.length();
4390
4391 // dentry
4392 dout(12) << "including dn " << *dn << dendl;
4393 encode(dn->get_name(), dnbl);
4394 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
4395
4396 // inode
4397 dout(12) << "including inode " << *in << dendl;
4398 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4399 if (r < 0) {
4400 // chop off dn->name, lease
4401 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4402 bufferlist keep;
4403 keep.substr_of(dnbl, 0, start_len);
4404 dnbl.swap(keep);
4405 break;
4406 }
4407 ceph_assert(r >= 0);
4408 numfiles++;
4409
4410 // touch dn
4411 mdcache->lru.lru_touch(dn);
4412 }
4413
4414 __u16 flags = 0;
4415 if (end) {
4416 flags = CEPH_READDIR_FRAG_END;
4417 if (start)
4418 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4419 }
4420 // clients without REPLY_BITFLAGS only understand the END and COMPLETE flags
4421 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4422 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4423 }
4424
4425 // finish final blob
4426 encode(numfiles, dirbl);
4427 encode(flags, dirbl);
4428 dirbl.claim_append(dnbl);
4429
4430 // yay, reply
4431 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4432 << " bytes=" << dirbl.length()
4433 << " start=" << (int)start
4434 << " end=" << (int)end
4435 << dendl;
4436 mdr->reply_extra_bl = dirbl;
4437
4438 // bump popularity. NOTE: this doesn't quite capture it.
4439 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
4440
4441 // reply
4442 mdr->tracei = diri;
4443 respond_to_request(mdr, 0);
4444 }
4445
4446
4447
4448 // ===============================================================================
4449 // INODE UPDATES
4450
4451
4452 /*
4453 * finisher for basic inode updates
4454 */
4455 class C_MDS_inode_update_finish : public ServerLogContext {
4456 CInode *in;
4457 bool truncating_smaller, changed_ranges, new_realm;
4458 public:
4459 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4460 bool sm=false, bool cr=false, bool nr=false) :
4461 ServerLogContext(s, r), in(i),
4462 truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
4463 void finish(int r) override {
4464 ceph_assert(r == 0);
4465
4466 // apply
4467 in->pop_and_dirty_projected_inode(mdr->ls);
4468 mdr->apply();
4469
4470 MDSRank *mds = get_mds();
4471
4472 // notify any clients
4473 if (truncating_smaller && in->inode.is_truncating()) {
4474 mds->locker->issue_truncate(in);
4475 mds->mdcache->truncate_inode(in, mdr->ls);
4476 }
4477
4478 if (new_realm) {
4479 int op = CEPH_SNAP_OP_SPLIT;
4480 mds->mdcache->send_snap_update(in, 0, op);
4481 mds->mdcache->do_realm_invalidate_and_update_notify(in, op);
4482 }
4483
4484 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4485
4486 server->respond_to_request(mdr, 0);
4487
4488 if (changed_ranges)
4489 get_mds()->locker->share_inode_max_size(in);
4490 }
4491 };
4492
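// Advisory locking (flock/fcntl).  Lock state lives on the inode, under
// an xlock on flocklock; a blocking request (wait flag set) that can't
// be satisfied parks itself on the inode's WAIT_FLOCK list instead of
// replying.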
4493 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4494 {
4495 const MClientRequest::const_ref &req = mdr->client_request;
4496 MutationImpl::LockOpVec lov;
4497
4498 // get the inode to operate on, and set up any locks needed for that
4499 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4500 if (!cur)
4501 return;
4502
4503 lov.add_xlock(&cur->flocklock);
4504 /* acquire_locks will return true if it gets the locks. If it fails,
4505 it will redeliver this request at a later date, so drop the request.
4506 */
4507 if (!mds->locker->acquire_locks(mdr, lov)) {
4508 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4509 return;
4510 }
4511
4512 // copy the lock change into a ceph_filelock so we can store/apply it
4513 ceph_filelock set_lock;
4514 set_lock.start = req->head.args.filelock_change.start;
4515 set_lock.length = req->head.args.filelock_change.length;
4516 set_lock.client = req->get_orig_source().num();
4517 set_lock.owner = req->head.args.filelock_change.owner;
4518 set_lock.pid = req->head.args.filelock_change.pid;
4519 set_lock.type = req->head.args.filelock_change.type;
4520 bool will_wait = req->head.args.filelock_change.wait;
4521
4522 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4523
4524 ceph_lock_state_t *lock_state = NULL;
4525 bool interrupt = false;
4526
4527 // get the appropriate lock state
4528 switch (req->head.args.filelock_change.rule) {
4529 case CEPH_LOCK_FLOCK_INTR:
4530 interrupt = true;
4531 // fall-thru
4532 case CEPH_LOCK_FLOCK:
4533 lock_state = cur->get_flock_lock_state();
4534 break;
4535
4536 case CEPH_LOCK_FCNTL_INTR:
4537 interrupt = true;
4538 // fall-thru
4539 case CEPH_LOCK_FCNTL:
4540 lock_state = cur->get_fcntl_lock_state();
4541 break;
4542
4543 default:
4544 dout(10) << "got unknown lock type " << set_lock.type
4545 << ", dropping request!" << dendl;
4546 respond_to_request(mdr, -EOPNOTSUPP);
4547 return;
4548 }
4549
4550 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4551 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4552 list<ceph_filelock> activated_locks;
4553 MDSContext::vec waiters;
4554 if (lock_state->is_waiting(set_lock)) {
4555 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4556 lock_state->remove_waiting(set_lock);
4557 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4558 } else if (!interrupt) {
4559 dout(10) << " unlock attempt on " << set_lock << dendl;
4560 lock_state->remove_lock(set_lock, activated_locks);
4561 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4562 }
4563 mds->queue_waiters(waiters);
4564
4565 respond_to_request(mdr, 0);
4566 } else {
4567 dout(10) << " lock attempt on " << set_lock << dendl;
4568 bool deadlock = false;
4569 if (mdr->more()->flock_was_waiting &&
4570 !lock_state->is_waiting(set_lock)) {
4571 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4572 respond_to_request(mdr, -EINTR);
4573 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4574 dout(10) << " it failed on this attempt" << dendl;
4575 // couldn't set lock right now
4576 if (deadlock) {
4577 respond_to_request(mdr, -EDEADLK);
4578 } else if (!will_wait) {
4579 respond_to_request(mdr, -EWOULDBLOCK);
4580 } else {
4581 dout(10) << " added to waiting list" << dendl;
4582 ceph_assert(lock_state->is_waiting(set_lock));
4583 mdr->more()->flock_was_waiting = true;
4584 mds->locker->drop_locks(mdr.get());
4585 mdr->drop_local_auth_pins();
4586 mdr->mark_event("failed to add lock, waiting");
4587 mdr->mark_nowarn();
4588 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4589 }
4590 } else
4591 respond_to_request(mdr, 0);
4592 }
4593 dout(10) << " state after lock change: " << *lock_state << dendl;
4594 }
4595
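// F_GETLK-style query: under a rdlock on flocklock, look up any lock
// that would conflict with checking_lock and return it in the reply.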
4596 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4597 {
4598 const MClientRequest::const_ref &req = mdr->client_request;
4599 MutationImpl::LockOpVec lov;
4600
4601 // get the inode to operate on, and set up any locks needed for that
4602 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4603 if (!cur)
4604 return;
4605
4606 /* acquire_locks will return true if it gets the locks. If it fails,
4607 it will redeliver this request at a later date, so drop the request.
4608 */
4609 lov.add_rdlock(&cur->flocklock);
4610 if (!mds->locker->acquire_locks(mdr, lov)) {
4611 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4612 return;
4613 }
4614
4615 // copy the lock change into a ceph_filelock so we can store/apply it
4616 ceph_filelock checking_lock;
4617 checking_lock.start = req->head.args.filelock_change.start;
4618 checking_lock.length = req->head.args.filelock_change.length;
4619 checking_lock.client = req->get_orig_source().num();
4620 checking_lock.owner = req->head.args.filelock_change.owner;
4621 checking_lock.pid = req->head.args.filelock_change.pid;
4622 checking_lock.type = req->head.args.filelock_change.type;
4623
4624 // get the appropriate lock state
4625 ceph_lock_state_t *lock_state = NULL;
4626 switch (req->head.args.filelock_change.rule) {
4627 case CEPH_LOCK_FLOCK:
4628 lock_state = cur->get_flock_lock_state();
4629 break;
4630
4631 case CEPH_LOCK_FCNTL:
4632 lock_state = cur->get_fcntl_lock_state();
4633 break;
4634
4635 default:
4636 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4637 respond_to_request(mdr, -EINVAL);
4638 return;
4639 }
4640 lock_state->look_for_lock(checking_lock);
4641
4642 bufferlist lock_bl;
4643 encode(checking_lock, lock_bl);
4644
4645 mdr->reply_extra_bl = lock_bl;
4646 respond_to_request(mdr, 0);
4647 }
4648
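/* setattr: project the requested attribute changes, journal them as an
 * EUpdate, and reply once committed.  A shrinking truncate is staged via
 * inode.truncate() + add_truncate_start() so the actual data purge
 * happens only after the journal entry is safe. */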
4649 void Server::handle_client_setattr(MDRequestRef& mdr)
4650 {
4651 const MClientRequest::const_ref &req = mdr->client_request;
4652 MutationImpl::LockOpVec lov;
4653 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4654 if (!cur) return;
4655
4656 if (mdr->snapid != CEPH_NOSNAP) {
4657 respond_to_request(mdr, -EROFS);
4658 return;
4659 }
4660 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4661 respond_to_request(mdr, -EPERM);
4662 return;
4663 }
4664
4665 __u32 mask = req->head.args.setattr.mask;
4666 __u32 access_mask = MAY_WRITE;
4667
4668 // xlock inode
4669 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4670 lov.add_xlock(&cur->authlock);
4671 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4672 lov.add_xlock(&cur->filelock);
4673 if (mask & CEPH_SETATTR_CTIME)
4674 lov.add_wrlock(&cur->versionlock);
4675
4676 if (!mds->locker->acquire_locks(mdr, lov))
4677 return;
4678
4679 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4680 access_mask |= MAY_CHOWN;
4681
4682 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4683 access_mask |= MAY_CHGRP;
4684
4685 if (!check_access(mdr, cur, access_mask))
4686 return;
4687
4688 // trunc from bigger -> smaller?
4689 auto pip = cur->get_projected_inode();
4690
4691 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
4692
4693 // ENOSPC on growing file while full, but allow shrinks
4694 if (is_full && req->head.args.setattr.size > old_size) {
4695 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4696 respond_to_request(mdr, -ENOSPC);
4697 return;
4698 }
4699
4700 bool truncating_smaller = false;
4701 if (mask & CEPH_SETATTR_SIZE) {
4702 truncating_smaller = req->head.args.setattr.size < old_size;
4703 if (truncating_smaller && pip->is_truncating()) {
4704 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4705 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4706 mds->locker->drop_locks(mdr.get());
4707 mdr->drop_local_auth_pins();
4708 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4709 return;
4710 }
4711 }
4712
4713 bool changed_ranges = false;
4714
4715 // project update
4716 mdr->ls = mdlog->get_current_segment();
4717 EUpdate *le = new EUpdate(mdlog, "setattr");
4718 mdlog->start_entry(le);
4719
4720 auto &pi = cur->project_inode();
4721
4722 if (mask & CEPH_SETATTR_UID)
4723 pi.inode.uid = req->head.args.setattr.uid;
4724 if (mask & CEPH_SETATTR_GID)
4725 pi.inode.gid = req->head.args.setattr.gid;
4726
4727 if (mask & CEPH_SETATTR_MODE)
4728 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4729 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4730 S_ISREG(pi.inode.mode) &&
4731 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4732 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4733 }
4734
4735 if (mask & CEPH_SETATTR_MTIME)
4736 pi.inode.mtime = req->head.args.setattr.mtime;
4737 if (mask & CEPH_SETATTR_ATIME)
4738 pi.inode.atime = req->head.args.setattr.atime;
4739 if (mask & CEPH_SETATTR_BTIME)
4740 pi.inode.btime = req->head.args.setattr.btime;
4741 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4742 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4743 if (mask & CEPH_SETATTR_SIZE) {
4744 if (truncating_smaller) {
4745 pi.inode.truncate(old_size, req->head.args.setattr.size);
4746 le->metablob.add_truncate_start(cur->ino());
4747 } else {
4748 pi.inode.size = req->head.args.setattr.size;
4749 pi.inode.rstat.rbytes = pi.inode.size;
4750 }
4751 pi.inode.mtime = mdr->get_op_stamp();
4752
4753 // adjust client's max_size?
4754 CInode::mempool_inode::client_range_map new_ranges;
4755 bool max_increased = false;
4756 mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
4757 if (pi.inode.client_ranges != new_ranges) {
4758 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
4759 pi.inode.client_ranges = new_ranges;
4760 changed_ranges = true;
4761 }
4762 }
4763
4764 pi.inode.version = cur->pre_dirty();
4765 pi.inode.ctime = mdr->get_op_stamp();
4766 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4767 pi.inode.rstat.rctime = mdr->get_op_stamp();
4768 pi.inode.change_attr++;
4769
4770 // log + wait
4771 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4772 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4773 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4774
4775 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4776 truncating_smaller, changed_ranges));
4777
4778 // flush immediately if there are readers/writers waiting
4779 if (mdr->is_xlocked(&cur->filelock) &&
4780 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4781 mds->mdlog->flush();
4782 }
4783
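// Open with O_TRUNC: issue caps first, then journal a truncate of the
// whole file (old_size -> 0) in the same EUpdate as the open.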
4784 /* Takes responsibility for mdr */
4785 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4786 {
4787 CInode *in = mdr->in[0];
4788 client_t client = mdr->get_client();
4789 ceph_assert(in);
4790
4791 dout(10) << "do_open_truncate " << *in << dendl;
4792
4793 SnapRealm *realm = in->find_snaprealm();
4794 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4795
4796 mdr->ls = mdlog->get_current_segment();
4797 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4798 mdlog->start_entry(le);
4799
4800 // prepare
4801 auto &pi = in->project_inode();
4802 pi.inode.version = in->pre_dirty();
4803 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
4804 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4805 pi.inode.rstat.rctime = mdr->get_op_stamp();
4806 pi.inode.change_attr++;
4807
4808 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
4809 if (old_size > 0) {
4810 pi.inode.truncate(old_size, 0);
4811 le->metablob.add_truncate_start(in->ino());
4812 }
4813
4814 bool changed_ranges = false;
4815 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4816 pi.inode.client_ranges[client].range.first = 0;
4817 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
4818 pi.inode.client_ranges[client].follows = realm->get_newest_seq();
4819 changed_ranges = true;
4820 cap->mark_clientwriteable();
4821 }
4822
4823 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4824
4825 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4826 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4827
4828 // make sure ino gets into the journal
4829 le->metablob.add_opened_ino(in->ino());
4830
4831 mdr->o_trunc = true;
4832
4833 CDentry *dn = 0;
4834 if (mdr->client_request->get_dentry_wanted()) {
4835 ceph_assert(mdr->dn[0].size());
4836 dn = mdr->dn[0].back();
4837 }
4838
4839 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4840 changed_ranges));
4841 // Although the `open` part can give an early reply, the truncation won't
4842 // happen until our EUpdate is persistent; to give the client a prompt
4843 // response we must also flush that event.
4844 mdlog->flush();
4845 }
4846
4847
4848 /* This function cleans up the passed mdr */
4849 void Server::handle_client_setlayout(MDRequestRef& mdr)
4850 {
4851 const MClientRequest::const_ref &req = mdr->client_request;
4852 MutationImpl::LockOpVec lov;
4853 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4854 if (!cur) return;
4855
4856 if (mdr->snapid != CEPH_NOSNAP) {
4857 respond_to_request(mdr, -EROFS);
4858 return;
4859 }
4860 if (!cur->is_file()) {
4861 respond_to_request(mdr, -EINVAL);
4862 return;
4863 }
4864 if (cur->get_projected_inode()->size ||
4865 cur->get_projected_inode()->truncate_seq > 1) {
4866 respond_to_request(mdr, -ENOTEMPTY);
4867 return;
4868 }
4869
4870 // validate layout
4871 file_layout_t layout = cur->get_projected_inode()->layout;
4872 // save existing layout for later
4873 const auto old_layout = layout;
4874
4875 int access = MAY_WRITE;
4876
4877 if (req->head.args.setlayout.layout.fl_object_size > 0)
4878 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4879 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4880 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4881 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4882 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4883 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4884 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4885
4886 // make sure we have as new a map as the client
4887 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4888 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4889 return;
4890 }
4891 }
4892
4893 // Don't permit layout modifications without 'p' caps
4894 if (layout != old_layout) {
4895 access |= MAY_SET_VXATTR;
4896 }
4897
4898 if (!layout.is_valid()) {
4899 dout(10) << "bad layout" << dendl;
4900 respond_to_request(mdr, -EINVAL);
4901 return;
4902 }
4903 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4904 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4905 respond_to_request(mdr, -EINVAL);
4906 return;
4907 }
4908
4909 lov.add_xlock(&cur->filelock);
4910 if (!mds->locker->acquire_locks(mdr, lov))
4911 return;
4912
4913 if (!check_access(mdr, cur, access))
4914 return;
4915
4916 // project update
4917 auto &pi = cur->project_inode();
4918 pi.inode.layout = layout;
4919 // add the old pool to the inode
4920 pi.inode.add_old_pool(old_layout.pool_id);
4921 pi.inode.version = cur->pre_dirty();
4922 pi.inode.ctime = mdr->get_op_stamp();
4923 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4924 pi.inode.rstat.rctime = mdr->get_op_stamp();
4925 pi.inode.change_attr++;
4926
4927 // log + wait
4928 mdr->ls = mdlog->get_current_segment();
4929 EUpdate *le = new EUpdate(mdlog, "setlayout");
4930 mdlog->start_entry(le);
4931 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4932 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4933 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4934
4935 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4936 }
4937
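// Directory layout policy: xlocks policylock rather than filelock, and
// sets no_early_reply so the reply is deferred until the layout update
// is journaled.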
4938 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4939 {
4940 const MClientRequest::const_ref &req = mdr->client_request;
4941 MutationImpl::LockOpVec lov;
4942 file_layout_t *dir_layout = nullptr;
4943 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
4944 if (!cur) return;
4945
4946 if (mdr->snapid != CEPH_NOSNAP) {
4947 respond_to_request(mdr, -EROFS);
4948 return;
4949 }
4950
4951 if (!cur->is_dir()) {
4952 respond_to_request(mdr, -ENOTDIR);
4953 return;
4954 }
4955
4956 lov.add_xlock(&cur->policylock);
4957 if (!mds->locker->acquire_locks(mdr, lov))
4958 return;
4959
4960 // validate layout
4961 const auto old_pi = cur->get_projected_inode();
4962 file_layout_t layout;
4963 if (old_pi->has_layout())
4964 layout = old_pi->layout;
4965 else if (dir_layout)
4966 layout = *dir_layout;
4967 else
4968 layout = mdcache->default_file_layout;
4969
4970 // Level of access required to complete
4971 int access = MAY_WRITE;
4972
4973 const auto old_layout = layout;
4974
4975 if (req->head.args.setlayout.layout.fl_object_size > 0)
4976 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4977 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4978 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4979 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4980 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4981 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4982 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4983 // make sure we have as new a map as the client
4984 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4985 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4986 return;
4987 }
4988 }
4989
4990 if (layout != old_layout) {
4991 access |= MAY_SET_VXATTR;
4992 }
4993
4994 if (!layout.is_valid()) {
4995 dout(10) << "bad layout" << dendl;
4996 respond_to_request(mdr, -EINVAL);
4997 return;
4998 }
4999 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5000 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5001 respond_to_request(mdr, -EINVAL);
5002 return;
5003 }
5004
5005 if (!check_access(mdr, cur, access))
5006 return;
5007
5008 auto &pi = cur->project_inode();
5009 pi.inode.layout = layout;
5010 pi.inode.version = cur->pre_dirty();
5011
5012 // log + wait
5013 mdr->ls = mdlog->get_current_segment();
5014 EUpdate *le = new EUpdate(mdlog, "setlayout");
5015 mdlog->start_entry(le);
5016 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5017 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5018 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5019
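// no_early_reply defers the reply until the policy update is journaled
// (journal_and_reply() normally sends an early, unsafe reply first).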
5020 mdr->no_early_reply = true;
5021 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5022 }
5023
5024 // XATTRS
5025
5026 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5027 file_layout_t *layout, bool validate)
5028 {
5029 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5030 try {
5031 if (name == "layout") {
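// a composite "layout" value is a space-separated list of key=value
// pairs, e.g. (illustrative values):
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
// each pair is re-dispatched below as an individual "layout.<key>" vxattr.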
5032 string::iterator begin = value.begin();
5033 string::iterator end = value.end();
5034 keys_and_values<string::iterator> p; // create instance of parser
5035 std::map<string, string> m; // map to receive results
5036 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5037 return -EINVAL;
5038 }
5039 string left(begin, end);
5040 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5041 if (begin != end)
5042 return -EINVAL;
5043 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5044 // Skip validation on each attr, we do it once at the end (avoid
5045 // rejecting intermediate states if the overall result is ok)
5046 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5047 osdmap, layout, false);
5048 if (r < 0)
5049 return r;
5050 }
5051 } else if (name == "layout.object_size") {
5052 layout->object_size = boost::lexical_cast<unsigned>(value);
5053 } else if (name == "layout.stripe_unit") {
5054 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5055 } else if (name == "layout.stripe_count") {
5056 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5057 } else if (name == "layout.pool") {
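// "layout.pool" accepts either a numeric pool id or a pool name; names
// are resolved against the current osdmap below.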
5058 try {
5059 layout->pool_id = boost::lexical_cast<unsigned>(value);
5060 } catch (boost::bad_lexical_cast const&) {
5061 int64_t pool = osdmap.lookup_pg_pool_name(value);
5062 if (pool < 0) {
5063 dout(10) << " unknown pool " << value << dendl;
5064 return -ENOENT;
5065 }
5066 layout->pool_id = pool;
5067 }
5068 } else if (name == "layout.pool_namespace") {
5069 layout->pool_ns = value;
5070 } else {
5071 dout(10) << " unknown layout vxattr " << name << dendl;
5072 return -EINVAL;
5073 }
5074 } catch (boost::bad_lexical_cast const&) {
5075 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5076 return -EINVAL;
5077 }
5078
5079 if (validate && !layout->is_valid()) {
5080 dout(10) << "bad layout" << dendl;
5081 return -EINVAL;
5082 }
5083 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5084 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5085 return -EINVAL;
5086 }
5087 return 0;
5088 }
5089
5090 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5091 {
5092 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5093 try {
5094 if (name == "quota") {
5095 string::iterator begin = value.begin();
5096 string::iterator end = value.end();
5097 if (begin == end) {
5098 // keep quota unchanged. (for create_quota_realm())
5099 return 0;
5100 }
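// like "layout", a composite "quota" value is space-separated key=value
// pairs, e.g. (illustrative) "max_bytes=10000000000 max_files=10000".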
5101 keys_and_values<string::iterator> p; // create instance of parser
5102 std::map<string, string> m; // map to receive results
5103 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5104 return -EINVAL;
5105 }
5106 string left(begin, end);
5107 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5108 if (begin != end)
5109 return -EINVAL;
5110 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5111 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5112 if (r < 0)
5113 return r;
5114 }
5115 } else if (name == "quota.max_bytes") {
5116 int64_t q = boost::lexical_cast<int64_t>(value);
5117 if (q < 0)
5118 return -EINVAL;
5119 quota->max_bytes = q;
5120 } else if (name == "quota.max_files") {
5121 int64_t q = boost::lexical_cast<int64_t>(value);
5122 if (q < 0)
5123 return -EINVAL;
5124 quota->max_files = q;
5125 } else {
5126 dout(10) << " unknown quota vxattr " << name << dendl;
5127 return -EINVAL;
5128 }
5129 } catch (boost::bad_lexical_cast const&) {
5130 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5131 return -EINVAL;
5132 }
5133
5134 if (!quota->is_valid()) {
5135 dout(10) << "bad quota" << dendl;
5136 return -EINVAL;
5137 }
5138 return 0;
5139 }
5140
5141 void Server::create_quota_realm(CInode *in)
5142 {
5143 dout(10) << __func__ << " " << *in << dendl;
5144
5145 auto req = MClientRequest::create(CEPH_MDS_OP_SETXATTR);
5146 req->set_filepath(filepath(in->ino()));
5147 req->set_string2("ceph.quota");
5148 // empty vxattr value
5149 req->set_tid(mds->issue_tid());
5150
5151 mds->send_message_mds(req, in->authority().first);
5152 }
5153
5154 /*
5155 * Verify that the file layout attribute carried by the client
5156 * is well-formed.
5157 * Returns 0 on success; otherwise this function takes
5158 * responsibility for replying to the passed mdr.
5159 */
5160 int Server::check_layout_vxattr(MDRequestRef& mdr,
5161 string name,
5162 string value,
5163 file_layout_t *layout)
5164 {
5165 const MClientRequest::const_ref &req = mdr->client_request;
5166 epoch_t epoch;
5167 int r;
5168
5169 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5170 r = parse_layout_vxattr(name, value, osdmap, layout);
5171 epoch = osdmap.get_epoch();
5172 });
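// r and epoch were captured under the osdmap lock above; the -ENOENT path
// below distinguishes "pool really doesn't exist" from "our osdmap is
// simply older than the client's".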
5173
5174 if (r == -ENOENT) {
5175
5176 // we don't have the specified pool, make sure our map
5177 // is at least as new as the client's.
5178 epoch_t req_epoch = req->get_osdmap_epoch();
5179
5180 if (req_epoch > epoch) {
5181
5182 // well, our map is older; wait for a newer osdmap.
5183 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5184
5185 if (!mds->objecter->wait_for_map(req_epoch, fin))
5186 return r; // wait, fin will retry this request later
5187
5188 delete fin;
5189
5190 // now we have at least as new a map as the client, try again.
5191 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5192 r = parse_layout_vxattr(name, value, osdmap, layout);
5193 epoch = osdmap.get_epoch();
5194 });
5195
5196 ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
5197
5198 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5199
5200 // For compatibility with clients running old code, we still need to get
5201 // the latest map. One day, when COMPAT_VERSION of MClientRequest is >= 3,
5202 // we can remove this code.
5203 mdr->waited_for_osdmap = true;
5204 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
5205 mds, new C_MDS_RetryRequest(mdcache, mdr)));
5206 return r;
5207 }
5208 }
5209
5210 if (r < 0) {
5211
5212 if (r == -ENOENT)
5213 r = -EINVAL;
5214
5215 respond_to_request(mdr, r);
5216 return r;
5217 }
5218
5219 // all is well
5220 return 0;
5221 }
5222
5223 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
5224 file_layout_t *dir_layout,
5225 MutationImpl::LockOpVec& lov)
5226 {
5227 const MClientRequest::const_ref &req = mdr->client_request;
5228 string name(req->get_path2());
5229 bufferlist bl = req->get_data();
5230 string value (bl.c_str(), bl.length());
5231 dout(10) << "handle_set_vxattr " << name
5232 << " val " << value.length()
5233 << " bytes on " << *cur
5234 << dendl;
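// vxattrs arrive as ordinary setxattr requests in the magic "ceph."
// namespace; e.g. a client-side call like (illustrative)
//   setfattr -n ceph.file.layout.pool -v cephfs_data somefile
// reaches this point with name "ceph.file.layout.pool" and value "cephfs_data".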
5235
5236 CInode::mempool_inode *pip = nullptr;
5237 string rest;
5238
5239 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5240 return;
5241 }
5242
5243 bool new_realm = false;
5244 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5245 if (!cur->is_dir()) {
5246 respond_to_request(mdr, -EINVAL);
5247 return;
5248 }
5249
5250 file_layout_t layout;
5251 if (cur->get_projected_inode()->has_layout())
5252 layout = cur->get_projected_inode()->layout;
5253 else if (dir_layout)
5254 layout = *dir_layout;
5255 else
5256 layout = mdcache->default_file_layout;
5257
5258 rest = name.substr(name.find("layout"));
5259 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5260 return;
5261
5262 lov.add_xlock(&cur->policylock);
5263 if (!mds->locker->acquire_locks(mdr, lov))
5264 return;
5265
5266 auto &pi = cur->project_inode();
5267 pi.inode.layout = layout;
5268 mdr->no_early_reply = true;
5269 pip = &pi.inode;
5270 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5271 if (!cur->is_file()) {
5272 respond_to_request(mdr, -EINVAL);
5273 return;
5274 }
5275 if (cur->get_projected_inode()->size ||
5276 cur->get_projected_inode()->truncate_seq > 1) {
5277 respond_to_request(mdr, -ENOTEMPTY);
5278 return;
5279 }
5280 file_layout_t layout = cur->get_projected_inode()->layout;
5281 rest = name.substr(name.find("layout"));
5282 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5283 return;
5284
5285 lov.add_xlock(&cur->filelock);
5286 if (!mds->locker->acquire_locks(mdr, lov))
5287 return;
5288
5289 auto &pi = cur->project_inode();
5290 int64_t old_pool = pi.inode.layout.pool_id;
5291 pi.inode.add_old_pool(old_pool);
5292 pi.inode.layout = layout;
5293 pip = &pi.inode;
5294 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5295 if (!cur->is_dir() || cur->is_root()) {
5296 respond_to_request(mdr, -EINVAL);
5297 return;
5298 }
5299
5300 quota_info_t quota = cur->get_projected_inode()->quota;
5301
5302 rest = name.substr(name.find("quota"));
5303 int r = parse_quota_vxattr(rest, value, &quota);
5304 if (r < 0) {
5305 respond_to_request(mdr, r);
5306 return;
5307 }
5308
5309 lov.add_xlock(&cur->policylock);
5310 if (quota.is_enable() && !cur->get_projected_srnode()) {
5311 lov.add_xlock(&cur->snaplock);
5312 new_realm = true;
5313 }
5314
5315 if (!mds->locker->acquire_locks(mdr, lov))
5316 return;
5317
5318 auto &pi = cur->project_inode(false, new_realm);
5319 pi.inode.quota = quota;
5320
5321 if (new_realm) {
5322 SnapRealm *realm = cur->find_snaprealm();
5323 auto seq = realm->get_newest_seq();
5324 auto &newsnap = *pi.snapnode;
5325 newsnap.created = seq;
5326 newsnap.seq = seq;
5327 }
5328 mdr->no_early_reply = true;
5329 pip = &pi.inode;
5330
5331 client_t exclude_ct = mdr->get_client();
5332 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5333 } else if (name.find("ceph.dir.pin") == 0) {
5334 if (!cur->is_dir() || cur->is_root()) {
5335 respond_to_request(mdr, -EINVAL);
5336 return;
5337 }
5338
5339 mds_rank_t rank;
5340 try {
5341 rank = boost::lexical_cast<mds_rank_t>(value);
5342 if (rank < 0) rank = MDS_RANK_NONE;
5343 } catch (boost::bad_lexical_cast const&) {
5344 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5345 respond_to_request(mdr, -EINVAL);
5346 return;
5347 }
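// a negative value normalizes to MDS_RANK_NONE, which clears the export
// pin; otherwise the subtree rooted at this directory is pinned to that rank.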
5348
5349 lov.add_xlock(&cur->policylock);
5350 if (!mds->locker->acquire_locks(mdr, lov))
5351 return;
5352
5353 auto &pi = cur->project_inode();
5354 cur->set_export_pin(rank);
5355 pip = &pi.inode;
5356 } else {
5357 dout(10) << " unknown vxattr " << name << dendl;
5358 respond_to_request(mdr, -EINVAL);
5359 return;
5360 }
5361
5362 pip->change_attr++;
5363 pip->ctime = mdr->get_op_stamp();
5364 if (mdr->get_op_stamp() > pip->rstat.rctime)
5365 pip->rstat.rctime = mdr->get_op_stamp();
5366 pip->version = cur->pre_dirty();
5367 if (cur->is_file())
5368 pip->update_backtrace();
5369
5370 // log + wait
5371 mdr->ls = mdlog->get_current_segment();
5372 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5373 mdlog->start_entry(le);
5374 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5375 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5376 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5377
5378 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5379 false, false, new_realm));
5380 return;
5381 }
5382
5383 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
5384 file_layout_t *dir_layout,
5385 MutationImpl::LockOpVec& lov)
5386 {
5387 const MClientRequest::const_ref &req = mdr->client_request;
5388 string name(req->get_path2());
5389
5390 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5391
5392 if (name == "ceph.dir.layout") {
5393 if (!cur->is_dir()) {
5394 respond_to_request(mdr, -ENODATA);
5395 return;
5396 }
5397 if (cur->is_root()) {
5398 dout(10) << "can't remove layout policy on the root directory" << dendl;
5399 respond_to_request(mdr, -EINVAL);
5400 return;
5401 }
5402
5403 if (!cur->get_projected_inode()->has_layout()) {
5404 respond_to_request(mdr, -ENODATA);
5405 return;
5406 }
5407
5408 lov.add_xlock(&cur->policylock);
5409 if (!mds->locker->acquire_locks(mdr, lov))
5410 return;
5411
5412 auto &pi = cur->project_inode();
5413 pi.inode.clear_layout();
5414 pi.inode.version = cur->pre_dirty();
5415
5416 // log + wait
5417 mdr->ls = mdlog->get_current_segment();
5418 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5419 mdlog->start_entry(le);
5420 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5421 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5422 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5423
5424 mdr->no_early_reply = true;
5425 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5426 return;
5427 } else if (name == "ceph.dir.layout.pool_namespace"
5428 || name == "ceph.file.layout.pool_namespace") {
5429 // The namespace is the only layout field with a meaningful null/none
5430 // value (an empty string means the default layout). Removing it is
5431 // equivalent to a setxattr with an empty string: pass the empty payload
5432 // of the rmxattr request through to do this.
5433 handle_set_vxattr(mdr, cur, dir_layout, lov);
5434 return;
5435 }
5436
5437 respond_to_request(mdr, -ENODATA);
5438 }
5439
5440 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
5441 CInode *in;
5442 public:
5443
5444 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
5445 ServerLogContext(s, r), in(i) { }
5446 void finish(int r) override {
5447 ceph_assert(r == 0);
5448
5449 // apply
5450 in->pop_and_dirty_projected_inode(mdr->ls);
5451
5452 mdr->apply();
5453
5454 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5455
5456 server->respond_to_request(mdr, 0);
5457 }
5458 };
5459
5460 void Server::handle_client_setxattr(MDRequestRef& mdr)
5461 {
5462 const MClientRequest::const_ref &req = mdr->client_request;
5463 string name(req->get_path2());
5464 MutationImpl::LockOpVec lov;
5465 CInode *cur;
5466
5467 file_layout_t *dir_layout = NULL;
5468 if (name.compare(0, 15, "ceph.dir.layout") == 0)
5469 cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
5470 else
5471 cur = rdlock_path_pin_ref(mdr, 0, lov, true);
5472 if (!cur)
5473 return;
5474
5475 if (mdr->snapid != CEPH_NOSNAP) {
5476 respond_to_request(mdr, -EROFS);
5477 return;
5478 }
5479
5480 int flags = req->head.args.setxattr.flags;
5481
5482 // magic ceph.* namespace?
5483 if (name.compare(0, 5, "ceph.") == 0) {
5484 handle_set_vxattr(mdr, cur, dir_layout, lov);
5485 return;
5486 }
5487
5488 lov.add_xlock(&cur->xattrlock);
5489 if (!mds->locker->acquire_locks(mdr, lov))
5490 return;
5491
5492 if (!check_access(mdr, cur, MAY_WRITE))
5493 return;
5494
5495 auto pxattrs = cur->get_projected_xattrs();
5496 size_t len = req->get_data().length();
5497 size_t inc = len + name.length();
5498
5499 // check xattrs kv pairs size
5500 size_t cur_xattrs_size = 0;
5501 for (const auto& p : *pxattrs) {
5502 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
5503 continue;
5504 }
5505 cur_xattrs_size += p.first.length() + p.second.length();
5506 }
5507
5508 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
5509 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5510 << cur_xattrs_size << ", inc " << inc << dendl;
5511 respond_to_request(mdr, -ENOSPC);
5512 return;
5513 }
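// CEPH_XATTR_CREATE/CEPH_XATTR_REPLACE mirror the XATTR_CREATE and
// XATTR_REPLACE flags of setxattr(2): CREATE fails with EEXIST if the
// xattr already exists, REPLACE fails with ENODATA if it does not.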
5514
5515 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
5516 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
5517 respond_to_request(mdr, -EEXIST);
5518 return;
5519 }
5520 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
5521 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
5522 respond_to_request(mdr, -ENODATA);
5523 return;
5524 }
5525
5526 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
5527
5528 // project update
5529 auto &pi = cur->project_inode(true);
5530 pi.inode.version = cur->pre_dirty();
5531 pi.inode.ctime = mdr->get_op_stamp();
5532 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5533 pi.inode.rstat.rctime = mdr->get_op_stamp();
5534 pi.inode.change_attr++;
5535 pi.inode.xattr_version++;
5536 auto &px = *pi.xattrs;
5537 if ((flags & CEPH_XATTR_REMOVE)) {
5538 px.erase(mempool::mds_co::string(name));
5539 } else {
5540 bufferptr b = buffer::create(len);
5541 if (len)
5542 req->get_data().copy(0, len, b.c_str());
5543 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
5544 if (!em.second)
5545 em.first->second = b;
5546 }
5547
5548 // log + wait
5549 mdr->ls = mdlog->get_current_segment();
5550 EUpdate *le = new EUpdate(mdlog, "setxattr");
5551 mdlog->start_entry(le);
5552 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5553 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5554 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5555
5556 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5557 }
5558
5559 void Server::handle_client_removexattr(MDRequestRef& mdr)
5560 {
5561 const MClientRequest::const_ref &req = mdr->client_request;
5562 std::string name(req->get_path2());
5563
5564 MutationImpl::LockOpVec lov;
5565 file_layout_t *dir_layout = nullptr;
5566 CInode *cur;
5567 if (name == "ceph.dir.layout")
5568 cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
5569 else
5570 cur = rdlock_path_pin_ref(mdr, 0, lov, true);
5571 if (!cur)
5572 return;
5573
5574 if (mdr->snapid != CEPH_NOSNAP) {
5575 respond_to_request(mdr, -EROFS);
5576 return;
5577 }
5578
5579 if (name.compare(0, 5, "ceph.") == 0) {
5580 handle_remove_vxattr(mdr, cur, dir_layout, lov);
5581 return;
5582 }
5583
5584 lov.add_xlock(&cur->xattrlock);
5585 if (!mds->locker->acquire_locks(mdr, lov))
5586 return;
5587
5588 auto pxattrs = cur->get_projected_xattrs();
5589 if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
5590 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5591 respond_to_request(mdr, -ENODATA);
5592 return;
5593 }
5594
5595 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5596
5597 // project update
5598 auto &pi = cur->project_inode(true);
5599 auto &px = *pi.xattrs;
5600 pi.inode.version = cur->pre_dirty();
5601 pi.inode.ctime = mdr->get_op_stamp();
5602 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5603 pi.inode.rstat.rctime = mdr->get_op_stamp();
5604 pi.inode.change_attr++;
5605 pi.inode.xattr_version++;
5606 px.erase(mempool::mds_co::string(name));
5607
5608 // log + wait
5609 mdr->ls = mdlog->get_current_segment();
5610 EUpdate *le = new EUpdate(mdlog, "removexattr");
5611 mdlog->start_entry(le);
5612 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5613 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5614 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5615
5616 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5617 }
5618
5619
5620 // =================================================================
5621 // DIRECTORY and NAMESPACE OPS
5622
5623
5624 // ------------------------------------------------
5625
5626 // MKNOD
5627
5628 class C_MDS_mknod_finish : public ServerLogContext {
5629 CDentry *dn;
5630 CInode *newi;
5631 public:
5632 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5633 ServerLogContext(s, r), dn(d), newi(ni) {}
5634 void finish(int r) override {
5635 ceph_assert(r == 0);
5636
5637 // link the inode
5638 dn->pop_projected_linkage();
5639
5640 // be a bit hacky with the inode version, here.. we decrement it
5641 // just to keep mark_dirty() happy. (we didn't bother projecting
5642 // a new version of the inode since it's just been created)
5643 newi->inode.version--;
5644 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5645 newi->mark_dirty_parent(mdr->ls, true);
5646
5647 // mkdir?
5648 if (newi->inode.is_dir()) {
5649 CDir *dir = newi->get_dirfrag(frag_t());
5650 ceph_assert(dir);
5651 dir->fnode.version--;
5652 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5653 dir->mark_new(mdr->ls);
5654 }
5655
5656 mdr->apply();
5657
5658 MDRequestRef null_ref;
5659 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5660
5661 if (newi->inode.is_file())
5662 get_mds()->locker->share_inode_max_size(newi);
5663
5664 // hit pop
5665 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
5666
5667 // reply
5668 server->respond_to_request(mdr, 0);
5669 }
5670 };
5671
5672
5673 void Server::handle_client_mknod(MDRequestRef& mdr)
5674 {
5675 const MClientRequest::const_ref &req = mdr->client_request;
5676 client_t client = mdr->get_client();
5677 MutationImpl::LockOpVec lov;
5678 file_layout_t *dir_layout = nullptr;
5679 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false,
5680 &dir_layout);
5681 if (!dn) return;
5682 if (mdr->snapid != CEPH_NOSNAP) {
5683 respond_to_request(mdr, -EROFS);
5684 return;
5685 }
5686 CInode *diri = dn->get_dir()->get_inode();
5687 lov.add_rdlock(&diri->authlock);
5688 if (!mds->locker->acquire_locks(mdr, lov))
5689 return;
5690
5691 if (!check_access(mdr, diri, MAY_WRITE))
5692 return;
5693
5694 if (!check_fragment_space(mdr, dn->get_dir()))
5695 return;
5696
5697 unsigned mode = req->head.args.mknod.mode;
5698 if ((mode & S_IFMT) == 0)
5699 mode |= S_IFREG;
5700
5701 // set layout
5702 file_layout_t layout;
5703 if (dir_layout && S_ISREG(mode))
5704 layout = *dir_layout;
5705 else
5706 layout = mdcache->default_file_layout;
5707
5708 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
5709 ceph_assert(newi);
5710
5711 dn->push_projected_linkage(newi);
5712
5713 newi->inode.rdev = req->head.args.mknod.rdev;
5714 newi->inode.version = dn->pre_dirty();
5715 newi->inode.rstat.rfiles = 1;
5716 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5717 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5718 newi->inode.update_backtrace();
5719
5720 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5721 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5722 ceph_assert(follows >= realm->get_newest_seq());
5723
5724 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5725 // want to write to it (e.g., if they are re-exporting over NFS)
5726 if (S_ISREG(newi->inode.mode)) {
5727 // issue a cap on the file
5728 int cmode = CEPH_FILE_MODE_RDWR;
5729 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5730 if (cap) {
5731 cap->set_wanted(0);
5732
5733 // put locks in excl mode
5734 newi->filelock.set_state(LOCK_EXCL);
5735 newi->authlock.set_state(LOCK_EXCL);
5736 newi->xattrlock.set_state(LOCK_EXCL);
5737
5738 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5739 newi->inode.client_ranges[client].range.first = 0;
5740 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5741 newi->inode.client_ranges[client].follows = follows;
5742 cap->mark_clientwriteable();
5743 }
5744 }
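// the client_range recorded above bounds how far the client may write
// before it must ask the MDS for a larger max_size (handled by the
// Locker's max-size machinery).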
5745
5746 ceph_assert(dn->first == follows + 1);
5747 newi->first = dn->first;
5748
5749 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5750
5751 // prepare finisher
5752 mdr->ls = mdlog->get_current_segment();
5753 EUpdate *le = new EUpdate(mdlog, "mknod");
5754 mdlog->start_entry(le);
5755 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5756 journal_allocated_inos(mdr, &le->metablob);
5757
5758 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5759 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5760 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5761
5762 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5763 }
5764
5765
5766
5767 // MKDIR
5768 /* This function takes responsibility for the passed mdr*/
5769 void Server::handle_client_mkdir(MDRequestRef& mdr)
5770 {
5771 const MClientRequest::const_ref &req = mdr->client_request;
5772 if (req->get_filepath().is_last_dot_or_dotdot()) {
5773 respond_to_request(mdr, -EEXIST);
5774 return;
5775 }
5776
5777 MutationImpl::LockOpVec lov;
5778 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5779 if (!dn) return;
5780 if (mdr->snapid != CEPH_NOSNAP) {
5781 respond_to_request(mdr, -EROFS);
5782 return;
5783 }
5784 CDir *dir = dn->get_dir();
5785 CInode *diri = dir->get_inode();
5786 lov.add_rdlock(&diri->authlock);
5787 if (!mds->locker->acquire_locks(mdr, lov))
5788 return;
5789
5790 // mkdir check access
5791 if (!check_access(mdr, diri, MAY_WRITE))
5792 return;
5793
5794 if (!check_fragment_space(mdr, dir))
5795 return;
5796
5797 // new inode
5798 unsigned mode = req->head.args.mkdir.mode;
5799 mode &= ~S_IFMT;
5800 mode |= S_IFDIR;
5801 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5802 ceph_assert(newi);
5803
5804 // it's a directory.
5805 dn->push_projected_linkage(newi);
5806
5807 newi->inode.version = dn->pre_dirty();
5808 newi->inode.rstat.rsubdirs = 1;
5809 newi->inode.update_backtrace();
5810
5811 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5812 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5813 ceph_assert(follows >= realm->get_newest_seq());
5814
5815 dout(12) << " follows " << follows << dendl;
5816 ceph_assert(dn->first == follows + 1);
5817 newi->first = dn->first;
5818
5819 // ...and that new dir is empty.
5820 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5821 newdir->state_set(CDir::STATE_CREATING);
5822 newdir->mark_complete();
5823 newdir->fnode.version = newdir->pre_dirty();
5824
5825 // prepare finisher
5826 mdr->ls = mdlog->get_current_segment();
5827 EUpdate *le = new EUpdate(mdlog, "mkdir");
5828 mdlog->start_entry(le);
5829 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5830 journal_allocated_inos(mdr, &le->metablob);
5831 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5832 le->metablob.add_primary_dentry(dn, newi, true, true);
5833 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5834
5835 // issue a cap on the directory
5836 int cmode = CEPH_FILE_MODE_RDWR;
5837 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5838 if (cap) {
5839 cap->set_wanted(0);
5840
5841 // put locks in excl mode
5842 newi->filelock.set_state(LOCK_EXCL);
5843 newi->authlock.set_state(LOCK_EXCL);
5844 newi->xattrlock.set_state(LOCK_EXCL);
5845 }
5846
5847 // make sure this inode gets into the journal
5848 le->metablob.add_opened_ino(newi->ino());
5849
5850 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5851
5852 // We hit_dir (via hit_inode) in our finish callback, but by then we might
5853 // have overshot the split size (multiple mkdir in flight), so here is
5854 // an early chance to split the dir if this mkdir makes it oversized.
5855 mds->balancer->maybe_fragment(dir, false);
5856 }
5857
5858
5859 // SYMLINK
5860
5861 void Server::handle_client_symlink(MDRequestRef& mdr)
5862 {
5863 const MClientRequest::const_ref &req = mdr->client_request;
5864 MutationImpl::LockOpVec lov;
5865 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5866 if (!dn) return;
5867 if (mdr->snapid != CEPH_NOSNAP) {
5868 respond_to_request(mdr, -EROFS);
5869 return;
5870 }
5871 CDir *dir = dn->get_dir();
5872 CInode *diri = dir->get_inode();
5873 lov.add_rdlock(&diri->authlock);
5874 if (!mds->locker->acquire_locks(mdr, lov))
5875 return;
5876
5877 if (!check_access(mdr, diri, MAY_WRITE))
5878 return;
5879
5880 if (!check_fragment_space(mdr, dir))
5881 return;
5882
5883 unsigned mode = S_IFLNK | 0777;
5884 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5885 ceph_assert(newi);
5886
5887 // it's a symlink
5888 dn->push_projected_linkage(newi);
5889
5890 newi->symlink = req->get_path2();
5891 newi->inode.size = newi->symlink.length();
5892 newi->inode.rstat.rbytes = newi->inode.size;
5893 newi->inode.rstat.rfiles = 1;
5894 newi->inode.version = dn->pre_dirty();
5895 newi->inode.update_backtrace();
5896
5897 newi->first = dn->first;
5898
5899 // prepare finisher
5900 mdr->ls = mdlog->get_current_segment();
5901 EUpdate *le = new EUpdate(mdlog, "symlink");
5902 mdlog->start_entry(le);
5903 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5904 journal_allocated_inos(mdr, &le->metablob);
5905 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5906 le->metablob.add_primary_dentry(dn, newi, true, true);
5907
5908 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5909 }
5910
5911
5912
5913
5914
5915 // LINK
5916
5917 void Server::handle_client_link(MDRequestRef& mdr)
5918 {
5919 const MClientRequest::const_ref &req = mdr->client_request;
5920
5921 dout(7) << "handle_client_link " << req->get_filepath()
5922 << " to " << req->get_filepath2()
5923 << dendl;
5924
5925 MutationImpl::LockOpVec lov;
5926
5927 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5928 if (!dn) return;
5929 CInode *targeti = rdlock_path_pin_ref(mdr, 1, lov, false);
5930 if (!targeti) return;
5931 if (mdr->snapid != CEPH_NOSNAP) {
5932 respond_to_request(mdr, -EROFS);
5933 return;
5934 }
5935
5936 CDir *dir = dn->get_dir();
5937 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5938 dout(7) << "target is " << *targeti << dendl;
5939 if (targeti->is_dir()) {
5940 // if srcdn is replica, need to make sure its linkage is correct
5941 vector<CDentry*>& trace = mdr->dn[1];
5942 if (trace.empty() ||
5943 trace.back()->is_auth() ||
5944 trace.back()->lock.can_read(mdr->get_client())) {
5945 dout(7) << "target is a dir, failing..." << dendl;
5946 respond_to_request(mdr, -EINVAL);
5947 return;
5948 }
5949 }
5950
5951 lov.erase_rdlock(&targeti->snaplock);
5952 lov.add_xlock(&targeti->snaplock);
5953 lov.add_xlock(&targeti->linklock);
5954
5955 if (!mds->locker->acquire_locks(mdr, lov))
5956 return;
5957
5958 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5959 if (!check_access(mdr, targeti, MAY_WRITE))
5960 return;
5961
5962 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5963 return;
5964
5965 if (!check_fragment_space(mdr, dir))
5966 return;
5967 }
5968
5969 // go!
5970 ceph_assert(g_conf()->mds_kill_link_at != 1);
5971
5972 // local or remote?
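// a local link (targeti is auth here) is a single journaled update; a
// remote link runs the two-phase master/slave protocol in _link_remote(),
// letting the target's auth MDS journal the nlink change first.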
5973 if (targeti->is_auth())
5974 _link_local(mdr, dn, targeti);
5975 else
5976 _link_remote(mdr, true, dn, targeti);
5977 }
5978
5979
5980 class C_MDS_link_local_finish : public ServerLogContext {
5981 CDentry *dn;
5982 CInode *targeti;
5983 version_t dnpv;
5984 version_t tipv;
5985 bool adjust_realm;
5986 public:
5987 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5988 version_t dnpv_, version_t tipv_, bool ar) :
5989 ServerLogContext(s, r), dn(d), targeti(ti),
5990 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
5991 void finish(int r) override {
5992 ceph_assert(r == 0);
5993 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
5994 }
5995 };
5996
5997
5998 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5999 {
6000 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6001
6002 mdr->ls = mdlog->get_current_segment();
6003
6004 // predirty NEW dentry
6005 version_t dnpv = dn->pre_dirty();
6006 version_t tipv = targeti->pre_dirty();
6007
6008 // project inode update
6009 auto &pi = targeti->project_inode();
6010 pi.inode.nlink++;
6011 pi.inode.ctime = mdr->get_op_stamp();
6012 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6013 pi.inode.rstat.rctime = mdr->get_op_stamp();
6014 pi.inode.change_attr++;
6015 pi.inode.version = tipv;
6016
6017 bool adjust_realm = false;
6018 if (!targeti->is_projected_snaprealm_global()) {
6019 sr_t *newsnap = targeti->project_snaprealm();
6020 targeti->mark_snaprealm_global(newsnap);
6021 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6022 adjust_realm = true;
6023 }
6024
6025 // log + wait
6026 EUpdate *le = new EUpdate(mdlog, "link_local");
6027 mdlog->start_entry(le);
6028 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6029 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6030 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6031 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6032 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6033
6034 // do this after predirty_*, to avoid funky extra dnl arg
6035 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6036
6037 journal_and_reply(mdr, targeti, dn, le,
6038 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6039 }
6040
6041 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6042 version_t dnpv, version_t tipv, bool adjust_realm)
6043 {
6044 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6045
6046 // link and unlock the NEW dentry
6047 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6048 if (!dnl->get_inode())
6049 dn->link_remote(dnl, targeti);
6050 dn->mark_dirty(dnpv, mdr->ls);
6051
6052 // target inode
6053 targeti->pop_and_dirty_projected_inode(mdr->ls);
6054
6055 mdr->apply();
6056
6057 MDRequestRef null_ref;
6058 mdcache->send_dentry_link(dn, null_ref);
6059
6060 if (adjust_realm) {
6061 int op = CEPH_SNAP_OP_SPLIT;
6062 mds->mdcache->send_snap_update(targeti, 0, op);
6063 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6064 }
6065
6066 // bump target popularity
6067 mds->balancer->hit_inode(targeti, META_POP_IWR);
6068 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6069
6070 // reply
6071 respond_to_request(mdr, 0);
6072 }
6073
6074
6075 // link / unlink remote
6076
6077 class C_MDS_link_remote_finish : public ServerLogContext {
6078 bool inc;
6079 CDentry *dn;
6080 CInode *targeti;
6081 version_t dpv;
6082 public:
6083 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6084 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6085 dpv(d->get_projected_version()) {}
6086 void finish(int r) override {
6087 ceph_assert(r == 0);
6088 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6089 }
6090 };
6091
6092 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6093 {
6094 dout(10) << "_link_remote "
6095 << (inc ? "link ":"unlink ")
6096 << *dn << " to " << *targeti << dendl;
6097
6098 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6099 mds_rank_t linkauth = targeti->authority().first;
6100 if (mdr->more()->witnessed.count(linkauth) == 0) {
6101 if (mds->is_cluster_degraded() &&
6102 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6103 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6104 if (mdr->more()->waiting_on_slave.empty())
6105 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6106 return;
6107 }
6108
6109 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6110 int op;
6111 if (inc)
6112 op = MMDSSlaveRequest::OP_LINKPREP;
6113 else
6114 op = MMDSSlaveRequest::OP_UNLINKPREP;
6115 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, op);
6116 targeti->set_object_info(req->get_object_info());
6117 req->op_stamp = mdr->get_op_stamp();
6118 if (auto& desti_srnode = mdr->more()->desti_srnode)
6119 encode(*desti_srnode, req->desti_snapbl);
6120 mds->send_message_mds(req, linkauth);
6121
6122 ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
6123 mdr->more()->waiting_on_slave.insert(linkauth);
6124 return;
6125 }
6126 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6127
6128 ceph_assert(g_conf()->mds_kill_link_at != 2);
6129
6130 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6131 delete desti_srnode;
6132 desti_srnode = NULL;
6133 }
6134
6135 mdr->set_mds_stamp(ceph_clock_now());
6136
6137 // add to event
6138 mdr->ls = mdlog->get_current_segment();
6139 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6140 mdlog->start_entry(le);
6141 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6142 if (!mdr->more()->witnessed.empty()) {
6143 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6144 le->reqid = mdr->reqid;
6145 le->had_slaves = true;
6146 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6147 }
6148
6149 if (inc) {
6150 dn->pre_dirty();
6151 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6152 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6153 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6154 } else {
6155 dn->pre_dirty();
6156 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6157 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6158 le->metablob.add_null_dentry(dn, true);
6159 dn->push_projected_linkage();
6160 }
6161
6162 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6163 }
6164
6165 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6166 CDentry *dn, CInode *targeti,
6167 version_t dpv)
6168 {
6169 dout(10) << "_link_remote_finish "
6170 << (inc ? "link ":"unlink ")
6171 << *dn << " to " << *targeti << dendl;
6172
6173 ceph_assert(g_conf()->mds_kill_link_at != 3);
6174
6175 if (!mdr->more()->witnessed.empty())
6176 mdcache->logged_master_update(mdr->reqid);
6177
6178 if (inc) {
6179 // link the new dentry
6180 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6181 if (!dnl->get_inode())
6182 dn->link_remote(dnl, targeti);
6183 dn->mark_dirty(dpv, mdr->ls);
6184 } else {
6185 // unlink main dentry
6186 dn->get_dir()->unlink_inode(dn);
6187 dn->pop_projected_linkage();
6188 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6189 }
6190
6191 mdr->apply();
6192
6193 MDRequestRef null_ref;
6194 if (inc)
6195 mdcache->send_dentry_link(dn, null_ref);
6196 else
6197 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6198
6199 // bump target popularity
6200 mds->balancer->hit_inode(targeti, META_POP_IWR);
6201 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6202
6203 // reply
6204 respond_to_request(mdr, 0);
6205
6206 if (!inc)
6207 // removing a new dn?
6208 dn->get_dir()->try_remove_unlinked_dn(dn);
6209 }
6210
6211
6212 // remote linking/unlinking
6213
6214 class C_MDS_SlaveLinkPrep : public ServerLogContext {
6215 CInode *targeti;
6216 bool adjust_realm;
6217 public:
6218 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6219 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6220 void finish(int r) override {
6221 ceph_assert(r == 0);
6222 server->_logged_slave_link(mdr, targeti, adjust_realm);
6223 }
6224 };
6225
6226 class C_MDS_SlaveLinkCommit : public ServerContext {
6227 MDRequestRef mdr;
6228 CInode *targeti;
6229 public:
6230 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6231 ServerContext(s), mdr(r), targeti(t) { }
6232 void finish(int r) override {
6233 server->_commit_slave_link(mdr, r, targeti);
6234 }
6235 };
6236
6237 void Server::handle_slave_link_prep(MDRequestRef& mdr)
6238 {
6239 dout(10) << "handle_slave_link_prep " << *mdr
6240 << " on " << mdr->slave_request->get_object_info()
6241 << dendl;
6242
6243 ceph_assert(g_conf()->mds_kill_link_at != 4);
6244
6245 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
6246 ceph_assert(targeti);
6247 dout(10) << "targeti " << *targeti << dendl;
6248 CDentry *dn = targeti->get_parent_dn();
6249 CDentry::linkage_t *dnl = dn->get_linkage();
6250 ceph_assert(dnl->is_primary());
6251
6252 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6253
6254 mdr->auth_pin(targeti);
6255
6256 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6257 ceph_assert(g_conf()->mds_kill_link_at != 5);
6258
6259 // journal it
6260 mdr->ls = mdlog->get_current_segment();
6261 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
6262 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
6263 mdlog->start_entry(le);
6264
6265 auto &pi = dnl->get_inode()->project_inode();
6266
6267 // update journaled target inode
6268 bool inc;
6269 bool adjust_realm = false;
6270 bool realm_projected = false;
6271 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
6272 inc = true;
6273 pi.inode.nlink++;
6274 if (!targeti->is_projected_snaprealm_global()) {
6275 sr_t *newsnap = targeti->project_snaprealm();
6276 targeti->mark_snaprealm_global(newsnap);
6277 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6278 adjust_realm = true;
6279 realm_projected = true;
6280 }
6281 } else {
6282 inc = false;
6283 pi.inode.nlink--;
6284 if (targeti->is_projected_snaprealm_global()) {
6285 ceph_assert(mdr->slave_request->desti_snapbl.length());
6286 auto p = mdr->slave_request->desti_snapbl.cbegin();
6287
6288 sr_t *newsnap = targeti->project_snaprealm();
6289 decode(*newsnap, p);
6290
6291 if (pi.inode.nlink == 0)
6292 ceph_assert(!newsnap->is_parent_global());
6293
6294 realm_projected = true;
6295 } else {
6296 ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
6297 }
6298 }
6299
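// record everything needed to undo this nlink change if the master
// aborts: the old ctime, the parent dir's old mtime/rctime, and (if
// projected) the prior snaprealm blob; the encoded rollback is embedded
// in the ESlaveUpdate so it survives an MDS restart.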
6300 link_rollback rollback;
6301 rollback.reqid = mdr->reqid;
6302 rollback.ino = targeti->ino();
6303 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
6304 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
6305 rollback.old_dir_mtime = pf->fragstat.mtime;
6306 rollback.old_dir_rctime = pf->rstat.rctime;
6307 rollback.was_inc = inc;
6308 if (realm_projected) {
6309 if (targeti->snaprealm) {
6310 encode(true, rollback.snapbl);
6311 targeti->encode_snap_blob(rollback.snapbl);
6312 } else {
6313 encode(false, rollback.snapbl);
6314 }
6315 }
6316 encode(rollback, le->rollback);
6317 mdr->more()->rollback_bl = le->rollback;
6318
6319 pi.inode.ctime = mdr->get_op_stamp();
6320 pi.inode.version = targeti->pre_dirty();
6321
6322 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
6323
6324 // commit case
6325 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6326 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
6327
6328 // set up commit waiter
6329 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
6330
6331 mdr->more()->slave_update_journaled = true;
6332 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
6333 mdr, __func__);
6334 mdlog->flush();
6335 }
6336
6337 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
6338 {
6339 dout(10) << "_logged_slave_link " << *mdr
6340 << " " << *targeti << dendl;
6341
6342 ceph_assert(g_conf()->mds_kill_link_at != 6);
6343
6344 // update the target
6345 targeti->pop_and_dirty_projected_inode(mdr->ls);
6346 mdr->apply();
6347
6348 // hit pop
6349 mds->balancer->hit_inode(targeti, META_POP_IWR);
6350
6351 // done.
6352 mdr->reset_slave_request();
6353
6354 if (adjust_realm) {
6355 int op = CEPH_SNAP_OP_SPLIT;
6356 mds->mdcache->send_snap_update(targeti, 0, op);
6357 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6358 }
6359
6360 // ack
6361 if (!mdr->aborted) {
6362 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
6363 mds->send_message_mds(reply, mdr->slave_to_mds);
6364 } else {
6365 dout(10) << " abort flag set, finishing" << dendl;
6366 mdcache->request_finish(mdr);
6367 }
6368 }
6369
6370
6371 struct C_MDS_CommittedSlave : public ServerLogContext {
6372 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
6373 void finish(int r) override {
6374 server->_committed_slave(mdr);
6375 }
6376 };
6377
6378 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
6379 {
6380 dout(10) << "_commit_slave_link " << *mdr
6381 << " r=" << r
6382 << " " << *targeti << dendl;
6383
6384 ceph_assert(g_conf()->mds_kill_link_at != 7);
6385
6386 if (r == 0) {
6387 // drop our pins, etc.
6388 mdr->cleanup();
6389
6390 // write a commit to the journal
6391 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
6392 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
6393 mdlog->start_entry(le);
6394 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6395 mdlog->flush();
6396 } else {
6397 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6398 }
6399 }
6400
6401 void Server::_committed_slave(MDRequestRef& mdr)
6402 {
6403 dout(10) << "_committed_slave " << *mdr << dendl;
6404
6405 ceph_assert(g_conf()->mds_kill_link_at != 8);
6406
6407 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
6408 mds->send_message_mds(req, mdr->slave_to_mds);
6409 mdcache->request_finish(mdr);
6410 }
6411
6412 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
6413 MutationRef mut;
6414 map<client_t,MClientSnap::ref> splits;
6415 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
6416 map<client_t,MClientSnap::ref>&& _splits) :
6417 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
6418 }
6419 void finish(int r) override {
6420 server->_link_rollback_finish(mut, mdr, splits);
6421 }
6422 };
6423
6424 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6425 {
6426 link_rollback rollback;
6427 auto p = rbl.cbegin();
6428 decode(rollback, p);
6429
6430 dout(10) << "do_link_rollback on " << rollback.reqid
6431 << (rollback.was_inc ? " inc":" dec")
6432 << " ino " << rollback.ino
6433 << dendl;
6434
6435 ceph_assert(g_conf()->mds_kill_link_at != 9);
6436
6437 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6438 ceph_assert(mdr || mds->is_resolve());
6439
6440 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
6441 mut->ls = mds->mdlog->get_current_segment();
6442
6443 CInode *in = mdcache->get_inode(rollback.ino);
6444 ceph_assert(in);
6445 dout(10) << " target is " << *in << dendl;
6446 ceph_assert(!in->is_projected()); // a live slave request holds the versionlock xlock.
6447
6448 auto &pi = in->project_inode();
6449 pi.inode.version = in->pre_dirty();
6450 mut->add_projected_inode(in);
6451
6452 // parent dir rctime
6453 CDir *parent = in->get_projected_parent_dn()->get_dir();
6454 fnode_t *pf = parent->project_fnode();
6455 mut->add_projected_fnode(parent);
6456 pf->version = parent->pre_dirty();
6457 if (pf->fragstat.mtime == pi.inode.ctime) {
6458 pf->fragstat.mtime = rollback.old_dir_mtime;
6459 if (pf->rstat.rctime == pi.inode.ctime)
6460 pf->rstat.rctime = rollback.old_dir_rctime;
6461 mut->add_updated_lock(&parent->get_inode()->filelock);
6462 mut->add_updated_lock(&parent->get_inode()->nestlock);
6463 }
6464
6465 // inode
6466 pi.inode.ctime = rollback.old_ctime;
6467 if (rollback.was_inc)
6468 pi.inode.nlink--;
6469 else
6470 pi.inode.nlink++;
6471
6472 map<client_t,MClientSnap::ref> splits;
6473 if (rollback.snapbl.length() && in->snaprealm) {
6474 bool hadrealm;
6475 auto p = rollback.snapbl.cbegin();
6476 decode(hadrealm, p);
6477 if (hadrealm) {
6478 if (!mds->is_resolve()) {
6479 sr_t *new_srnode = new sr_t();
6480 decode(*new_srnode, p);
6481 in->project_snaprealm(new_srnode);
6482 } else {
6483 decode(in->snaprealm->srnode, p);
6484 }
6485 } else {
6486 SnapRealm *realm = parent->get_inode()->find_snaprealm();
6487 if (!mds->is_resolve())
6488 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
6489 in->project_snaprealm(NULL);
6490 }
6491 }
6492
6493 // journal it
6494 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
6495 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
6496 mdlog->start_entry(le);
6497 le->commit.add_dir_context(parent);
6498 le->commit.add_dir(parent, true);
6499 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
6500
6501 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
6502 mdr, __func__);
6503 mdlog->flush();
6504 }
6505
6506 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
6507 map<client_t,MClientSnap::ref>& splits)
6508 {
6509 dout(10) << "_link_rollback_finish" << dendl;
6510
6511 ceph_assert(g_conf()->mds_kill_link_at != 10);
6512
6513 mut->apply();
6514
6515 if (!mds->is_resolve())
6516 mdcache->send_snaps(splits);
6517
6518 if (mdr)
6519 mdcache->request_finish(mdr);
6520
6521 mdcache->finish_rollback(mut->reqid);
6522
6523 mut->cleanup();
6524 }
6525
6526
6527 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m)
6528 {
6529 dout(10) << "handle_slave_link_prep_ack " << *mdr
6530 << " " << *m << dendl;
6531 mds_rank_t from = mds_rank_t(m->get_source().num());
6532
6533 ceph_assert(g_conf()->mds_kill_link_at != 11);
6534
6535 // note slave
6536 mdr->more()->slaves.insert(from);
6537
6538 // witnessed!
6539 ceph_assert(mdr->more()->witnessed.count(from) == 0);
6540 mdr->more()->witnessed.insert(from);
6541 ceph_assert(!m->is_not_journaled());
6542 mdr->more()->has_journaled_slaves = true;
6543
6544 // remove from waiting list
6545 ceph_assert(mdr->more()->waiting_on_slave.count(from));
6546 mdr->more()->waiting_on_slave.erase(from);
6547
6548 ceph_assert(mdr->more()->waiting_on_slave.empty());
6549
6550 dispatch_client_request(mdr); // go again!
6551 }
6552
6553
6554
6555
6556
6557 // UNLINK
6558
6559 void Server::handle_client_unlink(MDRequestRef& mdr)
6560 {
6561 const MClientRequest::const_ref &req = mdr->client_request;
6562 client_t client = mdr->get_client();
6563
6564 // rmdir or unlink?
6565 bool rmdir = false;
6566 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
6567
6568 const filepath& refpath = req->get_filepath();
6569 if (refpath.depth() == 0) {
6570 respond_to_request(mdr, -EINVAL);
6571 return;
6572 }
6573 if (refpath.is_last_dot_or_dotdot()) {
6574 respond_to_request(mdr, -ENOTEMPTY);
6575 return;
6576 }
6577
6578 // traverse to path
6579 vector<CDentry*> trace;
6580 CInode *in;
6581 CF_MDS_MDRContextFactory cf(mdcache, mdr);
6582 int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &in, MDS_TRAVERSE_FORWARD);
6583 if (r > 0) return;
6584 if (r < 0) {
6585 if (r == -ESTALE) {
6586 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6587 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6588 return;
6589 }
6590 respond_to_request(mdr, r);
6591 return;
6592 }
6593 if (mdr->snapid != CEPH_NOSNAP) {
6594 respond_to_request(mdr, -EROFS);
6595 return;
6596 }
6597
6598 CDentry *dn = trace.back();
6599 ceph_assert(dn);
6600 if (!dn->is_auth()) {
6601 mdcache->request_forward(mdr, dn->authority().first);
6602 return;
6603 }
6604
6605 CInode *diri = dn->get_dir()->get_inode();
6606
6607 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
6608 ceph_assert(!dnl->is_null());
6609
6610 if (rmdir) {
6611 dout(7) << "handle_client_rmdir on " << *dn << dendl;
6612 } else {
6613 dout(7) << "handle_client_unlink on " << *dn << dendl;
6614 }
6615 dout(7) << "dn links to " << *in << dendl;
6616
6617 // rmdir vs is_dir
6618 if (in->is_dir()) {
6619 if (rmdir) {
6620 // do empty directory checks
6621 if (_dir_is_nonempty_unlocked(mdr, in)) {
6622 respond_to_request(mdr, -ENOTEMPTY);
6623 return;
6624 }
6625 } else {
6626 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
6627 respond_to_request(mdr, -EISDIR);
6628 return;
6629 }
6630 } else {
6631 if (rmdir) {
6632 // unlink
6633 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
6634 respond_to_request(mdr, -ENOTDIR);
6635 return;
6636 }
6637 }
6638
6639 // -- create stray dentry? --
6640 CDentry *straydn = NULL;
6641 if (dnl->is_primary()) {
6642 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
6643 if (!straydn)
6644 return;
6645 dout(10) << " straydn is " << *straydn << dendl;
6646 } else if (mdr->straydn) {
6647 mdr->unpin(mdr->straydn);
6648 mdr->straydn = NULL;
6649 }
6650
6651 // lock
6652 MutationImpl::LockOpVec lov;
6653
6654 for (int i=0; i<(int)trace.size()-1; i++)
6655 lov.add_rdlock(&trace[i]->lock);
6656 lov.add_xlock(&dn->lock);
6657 lov.add_wrlock(&diri->filelock);
6658 lov.add_wrlock(&diri->nestlock);
6659 lov.add_xlock(&in->linklock);
6660 if (straydn) {
6661 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
6662 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
6663 lov.add_xlock(&straydn->lock);
6664 }
6665
6666 mds->locker->include_snap_rdlocks(diri, lov);
6667 lov.add_xlock(&in->snaplock);
6668 if (in->is_dir())
6669 lov.add_rdlock(&in->filelock); // to verify it's empty
6670
6671 if (!mds->locker->acquire_locks(mdr, lov))
6672 return;
6673
6674 if (in->is_dir() &&
6675 _dir_is_nonempty(mdr, in)) {
6676 respond_to_request(mdr, -ENOTEMPTY);
6677 return;
6678 }
6679
6680 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6681 if (!check_access(mdr, diri, MAY_WRITE))
6682 return;
6683 }
6684
6685 if (straydn)
6686 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
6687
6688 if (!mdr->more()->desti_srnode) {
6689 if (in->is_projected_snaprealm_global()) {
6690 sr_t *new_srnode = in->prepare_new_srnode(0);
6691 in->record_snaprealm_parent_dentry(new_srnode, NULL, dn, dnl->is_primary());
6692 // dropping the last linkage or dropping the last remote linkage
6693 // detaches the inode from the global snaprealm
6694 auto nlink = in->get_projected_inode()->nlink;
6695 if (nlink == 1 ||
6696 (nlink == 2 && !dnl->is_primary() &&
6697 !in->get_projected_parent_dir()->inode->is_stray()))
6698 in->clear_snaprealm_global(new_srnode);
6699 mdr->more()->desti_srnode = new_srnode;
6700 } else if (dnl->is_primary()) {
6701 // prepare snaprealm blob for slave request
6702 SnapRealm *realm = in->find_snaprealm();
6703 snapid_t follows = realm->get_newest_seq();
6704 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
6705 sr_t *new_srnode = in->prepare_new_srnode(follows);
6706 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
6707 mdr->more()->desti_srnode = new_srnode;
6708 }
6709 }
6710 }
6711
6712 // yay!
6713 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
6714 // subtree root auths need to be witnesses
6715 set<mds_rank_t> witnesses;
6716 in->list_replicas(witnesses);
6717 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6718
6719 for (set<mds_rank_t>::iterator p = witnesses.begin();
6720 p != witnesses.end();
6721 ++p) {
6722 if (mdr->more()->witnessed.count(*p)) {
6723 dout(10) << " already witnessed by mds." << *p << dendl;
6724 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6725 dout(10) << " already waiting on witness mds." << *p << dendl;
6726 } else {
6727 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
6728 return;
6729 }
6730 }
6731 if (!mdr->more()->waiting_on_slave.empty())
6732 return; // we're waiting for a witness.
6733 }
6734
6735 // ok!
6736 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
6737 _link_remote(mdr, false, dn, dnl->get_inode());
6738 else
6739 _unlink_local(mdr, dn, straydn);
6740 }
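// Editorial note -- an illustrative recap, not part of the original
// source: the lock-op vector assembled above follows the standard
// namespace-operation pattern. Rdlock the ancestor dentries, xlock the
// dentry being mutated, wrlock the parent's scatterlocks, and xlock the
// inode locks that will change:
//
//   MutationImpl::LockOpVec lov;
//   for (int i = 0; i < (int)trace.size() - 1; i++)
//     lov.add_rdlock(&trace[i]->lock);  // pin the path to the parent
//   lov.add_xlock(&dn->lock);           // the dentry being unlinked
//   lov.add_wrlock(&diri->filelock);    // parent fragstat (size/mtime)
//   lov.add_wrlock(&diri->nestlock);    // parent recursive rstat
//   lov.add_xlock(&in->linklock);       // nlink is about to change
//   if (!mds->locker->acquire_locks(mdr, lov))
//     return;                           // request re-queued until ready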
6741
6742 class C_MDS_unlink_local_finish : public ServerLogContext {
6743 CDentry *dn;
6744 CDentry *straydn;
6745 version_t dnpv; // deleted dentry
6746 public:
6747 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
6748 ServerLogContext(s, r), dn(d), straydn(sd),
6749 dnpv(d->get_projected_version()) {}
6750 void finish(int r) override {
6751 ceph_assert(r == 0);
6752 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
6753 }
6754 };
6755
6756 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6757 {
6758 dout(10) << "_unlink_local " << *dn << dendl;
6759
6760 CDentry::linkage_t *dnl = dn->get_projected_linkage();
6761 CInode *in = dnl->get_inode();
6762
6763
6764 // ok, let's do it.
6765 mdr->ls = mdlog->get_current_segment();
6766
6767 // prepare log entry
6768 EUpdate *le = new EUpdate(mdlog, "unlink_local");
6769 mdlog->start_entry(le);
6770 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6771 if (!mdr->more()->witnessed.empty()) {
6772 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6773 le->reqid = mdr->reqid;
6774 le->had_slaves = true;
6775 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6776 }
6777
6778 if (straydn) {
6779 ceph_assert(dnl->is_primary());
6780 straydn->push_projected_linkage(in);
6781 }
6782
6783 // the unlinked dentry
6784 dn->pre_dirty();
6785
6786 auto &pi = in->project_inode();
6787 {
6788 std::string t;
6789 dn->make_path_string(t, true);
6790 pi.inode.stray_prior_path = std::move(t);
6791 }
6792 pi.inode.version = in->pre_dirty();
6793 pi.inode.ctime = mdr->get_op_stamp();
6794 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6795 pi.inode.rstat.rctime = mdr->get_op_stamp();
6796 pi.inode.change_attr++;
6797 pi.inode.nlink--;
6798 if (pi.inode.nlink == 0)
6799 in->state_set(CInode::STATE_ORPHAN);
6800
6801 if (mdr->more()->desti_srnode) {
6802 auto& desti_srnode = mdr->more()->desti_srnode;
6803 in->project_snaprealm(desti_srnode);
6804 desti_srnode = NULL;
6805 }
6806
6807 if (straydn) {
6808 // will manually pop projected inode
6809
6810 // primary link. add stray dentry.
6811 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
6812 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6813
6814 pi.inode.update_backtrace();
6815 le->metablob.add_primary_dentry(straydn, in, true, true);
6816 } else {
6817 mdr->add_projected_inode(in);
6818 // remote link. update remote inode.
6819 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
6820 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6821 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
6822 }
6823
6824 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6825 le->metablob.add_null_dentry(dn, true);
6826
6827 if (in->is_dir()) {
6828 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6829 le->metablob.renamed_dirino = in->ino();
6830 }
6831
6832 dn->push_projected_linkage();
6833
6834 if (straydn) {
6835 ceph_assert(in->first <= straydn->first);
6836 in->first = straydn->first;
6837 }
6838
6839 if (in->is_dir()) {
6840 ceph_assert(straydn);
6841 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6842 }
6843
6844 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6845 }
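// Editorial note -- an illustrative sketch, not part of the original
// source: _unlink_local() uses the projected-update pipeline common to
// all MDS metadata writes. State is projected in memory, a version is
// reserved with pre_dirty(), the change is described in an EUpdate
// metablob, and the projection is only applied once the journal entry
// commits:
//
//   auto &pi = in->project_inode();       // copy-on-write inode copy
//   pi.inode.version = in->pre_dirty();   // reserve the next version
//   EUpdate *le = new EUpdate(mdlog, "unlink_local");
//   mdlog->start_entry(le);
//   // ...describe dentries/inodes in le->metablob...
//   journal_and_reply(mdr, 0, dn, le, fin);  // fin runs at log commit
//
// The matching finish context (_unlink_local_finish below) then pops
// the projected linkages and inode, making the change visible.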
6846
6847 void Server::_unlink_local_finish(MDRequestRef& mdr,
6848 CDentry *dn, CDentry *straydn,
6849 version_t dnpv)
6850 {
6851 dout(10) << "_unlink_local_finish " << *dn << dendl;
6852
6853 if (!mdr->more()->witnessed.empty())
6854 mdcache->logged_master_update(mdr->reqid);
6855
6856 CInode *strayin = NULL;
6857 bool hadrealm = false;
6858 if (straydn) {
6859 // if there is a newly created snaprealm, we need to split the old snaprealm's
6860 // inodes_with_caps, so pop the snaprealm before the linkage changes.
6861 strayin = dn->get_linkage()->get_inode();
6862 hadrealm = strayin->snaprealm ? true : false;
6863 strayin->early_pop_projected_snaprealm();
6864 }
6865
6866 // unlink main dentry
6867 dn->get_dir()->unlink_inode(dn);
6868 dn->pop_projected_linkage();
6869
6870 // relink as stray? (i.e. was primary link?)
6871 if (straydn) {
6872 dout(20) << " straydn is " << *straydn << dendl;
6873 straydn->pop_projected_linkage();
6874
6875 strayin->pop_and_dirty_projected_inode(mdr->ls);
6876
6877 mdcache->touch_dentry_bottom(straydn);
6878 }
6879
6880 dn->mark_dirty(dnpv, mdr->ls);
6881 mdr->apply();
6882
6883 mdcache->send_dentry_unlink(dn, straydn, mdr);
6884
6885 if (straydn) {
6886 // update subtree map?
6887 if (strayin->is_dir())
6888 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6889
6890 if (strayin->snaprealm && !hadrealm)
6891 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
6892 }
6893
6894 // bump pop
6895 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6896
6897 // reply
6898 respond_to_request(mdr, 0);
6899
6900 // removing a new dn?
6901 dn->get_dir()->try_remove_unlinked_dn(dn);
6902
6903 // clean up ?
6904 // respond_to_request() drops locks. So stray reintegration can race with us.
6905 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6906 // Tip off the MDCache that this dentry is a stray that
6907 // might be eligible for purge.
6908 mdcache->notify_stray(straydn);
6909 }
6910 }
6911
6912 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6913 {
6914 if (mds->is_cluster_degraded() &&
6915 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6916 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6917 if (mdr->more()->waiting_on_slave.empty())
6918 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6919 return false;
6920 }
6921
6922 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6923 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
6924 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6925 for (auto dn : trace)
6926 req->srcdnpath.push_dentry(dn->get_name());
6927 mdcache->replicate_stray(straydn, who, req->straybl);
6928 if (mdr->more()->desti_srnode)
6929 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
6930
6931 req->op_stamp = mdr->get_op_stamp();
6932 mds->send_message_mds(req, who);
6933
6934 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
6935 mdr->more()->waiting_on_slave.insert(who);
6936 return true;
6937 }
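// Editorial note, not part of the original source: the witness
// handshake is asynchronous. The master records the target rank in
// waiting_on_slave and returns; when the OP_RMDIRPREPACK arrives,
// handle_slave_rmdir_prep_ack() (below) moves the rank from
// waiting_on_slave to witnessed and, once the set drains, re-dispatches
// the original client request:
//
//   mdr->more()->waiting_on_slave.insert(who);  // sent, not yet acked
//   ...
//   mdr->more()->witnessed.insert(from);        // ack received
//   if (mdr->more()->waiting_on_slave.empty())
//     dispatch_client_request(mdr);             // go again!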
6938
6939 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6940 CDentry *dn, *straydn;
6941 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6942 : ServerLogContext(s, r), dn(d), straydn(st) {}
6943 void finish(int r) override {
6944 server->_logged_slave_rmdir(mdr, dn, straydn);
6945 }
6946 };
6947
6948 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6949 MDRequestRef mdr;
6950 CDentry *straydn;
6951 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6952 : ServerContext(s), mdr(r), straydn(sd) { }
6953 void finish(int r) override {
6954 server->_commit_slave_rmdir(mdr, r, straydn);
6955 }
6956 };
6957
6958 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6959 {
6960 dout(10) << "handle_slave_rmdir_prep " << *mdr
6961 << " " << mdr->slave_request->srcdnpath
6962 << " to " << mdr->slave_request->destdnpath
6963 << dendl;
6964
6965 vector<CDentry*> trace;
6966 filepath srcpath(mdr->slave_request->srcdnpath);
6967 dout(10) << " src " << srcpath << dendl;
6968 CInode *in;
6969 CF_MDS_MDRContextFactory cf(mdcache, mdr);
6970 int r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6971 if (r > 0) return;
6972 if (r == -ESTALE) {
6973 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6974 mdr->slave_to_mds);
6975 return;
6976 }
6977 ceph_assert(r == 0);
6978 CDentry *dn = trace.back();
6979 dout(10) << " dn " << *dn << dendl;
6980 mdr->pin(dn);
6981
6982 ceph_assert(mdr->straydn);
6983 CDentry *straydn = mdr->straydn;
6984 dout(10) << " straydn " << *straydn << dendl;
6985
6986 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6987
6988 rmdir_rollback rollback;
6989 rollback.reqid = mdr->reqid;
6990 rollback.src_dir = dn->get_dir()->dirfrag();
6991 rollback.src_dname = dn->get_name();
6992 rollback.dest_dir = straydn->get_dir()->dirfrag();
6993 rollback.dest_dname = straydn->get_name();
6994 if (mdr->slave_request->desti_snapbl.length()) {
6995 if (in->snaprealm) {
6996 encode(true, rollback.snapbl);
6997 in->encode_snap_blob(rollback.snapbl);
6998 } else {
6999 encode(false, rollback.snapbl);
7000 }
7001 }
7002 encode(rollback, mdr->more()->rollback_bl);
7003 // FIXME: rollback snaprealm
7004 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7005
7006 // set up commit waiter
7007 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
7008
7009 straydn->push_projected_linkage(in);
7010 dn->push_projected_linkage();
7011
7012 ceph_assert(straydn->first >= in->first);
7013 in->first = straydn->first;
7014
7015 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7016 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7017 _logged_slave_rmdir(mdr, dn, straydn);
7018 return;
7019 }
7020
7021 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
7022 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
7023 mdlog->start_entry(le);
7024 le->rollback = mdr->more()->rollback_bl;
7025
7026 le->commit.add_dir_context(straydn->get_dir());
7027 le->commit.add_primary_dentry(straydn, in, true);
7028 // slave: no need to journal original dentry
7029
7030 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7031 le->commit.renamed_dirino = in->ino();
7032
7033 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7034
7035 mdr->more()->slave_update_journaled = true;
7036 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
7037 mdr, __func__);
7038 mdlog->flush();
7039 }
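// Editorial note -- an illustrative sketch, not part of the original
// source: the rollback blob encoded above is the slave's insurance. If
// the master fails before committing, do_rmdir_rollback() (below)
// decodes it and re-links the inode under its original dentry:
//
//   rmdir_rollback rollback;
//   auto p = rbl.cbegin();
//   decode(rollback, p);                // src/dest dirfrags + dnames
//   dn->push_projected_linkage(in);     // re-link under the src dentry
//   straydn->push_projected_linkage();  // null out the stray dentry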
7040
7041 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7042 {
7043 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
7044 CInode *in = dn->get_linkage()->get_inode();
7045
7046 bool new_realm;
7047 if (mdr->slave_request->desti_snapbl.length()) {
7048 new_realm = !in->snaprealm;
7049 in->decode_snap_blob(mdr->slave_request->desti_snapbl);
7050 ceph_assert(in->snaprealm);
7051 ceph_assert(in->snaprealm->have_past_parents_open());
7052 } else {
7053 new_realm = false;
7054 }
7055
7056 // update our cache now, so we are consistent with what is in the journal
7057 // when we journal a subtree map
7058 dn->get_dir()->unlink_inode(dn);
7059 straydn->pop_projected_linkage();
7060 dn->pop_projected_linkage();
7061
7062 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
7063
7064 if (new_realm)
7065 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7066
7067 // done.
7068 mdr->reset_slave_request();
7069 mdr->straydn = 0;
7070
7071 if (!mdr->aborted) {
7072 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
7073 if (!mdr->more()->slave_update_journaled)
7074 reply->mark_not_journaled();
7075 mds->send_message_mds(reply, mdr->slave_to_mds);
7076 } else {
7077 dout(10) << " abort flag set, finishing" << dendl;
7078 mdcache->request_finish(mdr);
7079 }
7080 }
7081
7082 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
7083 {
7084 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7085 << " " << *ack << dendl;
7086
7087 mds_rank_t from = mds_rank_t(ack->get_source().num());
7088
7089 mdr->more()->slaves.insert(from);
7090 mdr->more()->witnessed.insert(from);
7091 if (!ack->is_not_journaled())
7092 mdr->more()->has_journaled_slaves = true;
7093
7094 // remove from waiting list
7095 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7096 mdr->more()->waiting_on_slave.erase(from);
7097
7098 if (mdr->more()->waiting_on_slave.empty())
7099 dispatch_client_request(mdr); // go again!
7100 else
7101 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
7102 }
7103
7104 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7105 {
7106 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
7107
7108 if (r == 0) {
7109 if (mdr->more()->slave_update_journaled) {
7110 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7111 if (strayin && !strayin->snaprealm)
7112 mdcache->clear_dirty_bits_for_stray(strayin);
7113 }
7114
7115 mdr->cleanup();
7116
7117 if (mdr->more()->slave_update_journaled) {
7118 // write a commit to the journal
7119 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
7120 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7121 ESlaveUpdate::RMDIR);
7122 mdlog->start_entry(le);
7123 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7124 mdlog->flush();
7125 } else {
7126 _committed_slave(mdr);
7127 }
7128 } else {
7129 // abort
7130 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
7131 }
7132 }
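// Editorial note, not part of the original source: a slave update has
// exactly two terminal paths. On r == 0 it journals an
// ESlaveUpdate::OP_COMMIT (or jumps straight to _committed_slave() if
// nothing was journaled); on error it replays the rollback blob via
// do_rmdir_rollback(). Either way, the resolve stage can later tell
// whether this rank applied the update.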
7133
7134 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7135 metareqid_t reqid;
7136 CDentry *dn;
7137 CDentry *straydn;
7138 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7139 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7140 void finish(int r) override {
7141 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7142 }
7143 };
7144
7145 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
7146 {
7147 // unlike the other rollback methods, the rmdir rollback is only
7148 // needed to record the subtree changes in the journal for inode
7149 // replicas who are auth for empty dirfrags. no actual changes to
7150 // the file system are taking place here, so there is no Mutation.
7151
7152 rmdir_rollback rollback;
7153 auto p = rbl.cbegin();
7154 decode(rollback, p);
7155
7156 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7157 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
7158 ceph_assert(mdr || mds->is_resolve());
7159
7160 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7161 if (!dir)
7162 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7163 ceph_assert(dir);
7164 CDentry *dn = dir->lookup(rollback.src_dname);
7165 ceph_assert(dn);
7166 dout(10) << " dn " << *dn << dendl;
7167 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7168 ceph_assert(straydir);
7169 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7170 ceph_assert(straydn);
7171 dout(10) << " straydn " << *straydn << dendl;
7172 CInode *in = straydn->get_linkage()->get_inode();
7173
7174 dn->push_projected_linkage(in);
7175 straydn->push_projected_linkage();
7176
7177 if (rollback.snapbl.length() && in->snaprealm) {
7178 bool hadrealm;
7179 auto p = rollback.snapbl.cbegin();
7180 decode(hadrealm, p);
7181 if (hadrealm) {
7182 decode(in->snaprealm->srnode, p);
7183 } else {
7184 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7185 }
7186 }
7187
7188 if (mdr && !mdr->more()->slave_update_journaled) {
7189 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7190
7191 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7192 return;
7193 }
7194
7195
7196 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
7197 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
7198 mdlog->start_entry(le);
7199
7200 le->commit.add_dir_context(dn->get_dir());
7201 le->commit.add_primary_dentry(dn, in, true);
7202 // slave: no need to journal straydn
7203
7204 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7205 le->commit.renamed_dirino = in->ino();
7206
7207 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7208
7209 submit_mdlog_entry(le,
7210 new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
7211 dn, straydn),
7212 mdr, __func__);
7213 mdlog->flush();
7214 }
7215
7216 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7217 {
7218 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7219
7220 straydn->get_dir()->unlink_inode(straydn);
7221 dn->pop_projected_linkage();
7222 straydn->pop_projected_linkage();
7223
7224 CInode *in = dn->get_linkage()->get_inode();
7225 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7226 !mdr || mdr->more()->slave_update_journaled);
7227
7228 if (mds->is_resolve()) {
7229 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7230 mdcache->try_trim_non_auth_subtree(root);
7231 }
7232
7233 if (mdr)
7234 mdcache->request_finish(mdr);
7235
7236 mdcache->finish_rollback(reqid);
7237 }
7238
7239
7240 /** _dir_is_nonempty[_unlocked]
7241 *
7242 * check if a directory is non-empty (i.e. whether rmdir must fail with -ENOTEMPTY).
7243 *
7244 * the unlocked variant is a fastpath check. we can't really be
7245 * sure until we rdlock the filelock.
7246 */
7247 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7248 {
7249 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7250 ceph_assert(in->is_auth());
7251
7252 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7253 return true; // in a snapshot!
7254
7255 list<CDir*> ls;
7256 in->get_dirfrags(ls);
7257 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7258 CDir *dir = *p;
7259 // is the frag obviously non-empty?
7260 if (dir->is_auth()) {
7261 if (dir->get_projected_fnode()->fragstat.size()) {
7262 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7263 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7264 return true;
7265 }
7266 }
7267 }
7268
7269 return false;
7270 }
7271
7272 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7273 {
7274 dout(10) << "dir_is_nonempty " << *in << dendl;
7275 ceph_assert(in->is_auth());
7276 ceph_assert(in->filelock.can_read(mdr->get_client()));
7277
7278 frag_info_t dirstat;
7279 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7280
7281 list<CDir*> ls;
7282 in->get_dirfrags(ls);
7283 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7284 CDir *dir = *p;
7285 const fnode_t *pf = dir->get_projected_fnode();
7286 if (pf->fragstat.size()) {
7287 dout(10) << "dir_is_nonempty dirstat has "
7288 << pf->fragstat.size() << " items " << *dir << dendl;
7289 return true;
7290 }
7291
7292 if (pf->accounted_fragstat.version == dirstat_version)
7293 dirstat.add(pf->accounted_fragstat);
7294 else
7295 dirstat.add(pf->fragstat);
7296 }
7297
7298 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7299 }
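// Editorial note -- a worked example, not part of the original source:
// the locked check sums per-frag stats and compares them against the
// inode's authoritative dirstat. When frag accounting has fully
// propagated, the two agree and all-zero fragstats really mean "empty":
//
//   // frag A: fragstat.size() == 0, accounted and current
//   // frag B: fragstat.size() == 0, accounted and current
//   // sum (0) == inode dirstat.size()  -> directory is empty
//
// Any mismatch means some frag's stats have not been accounted yet, so
// the function conservatively reports the directory as non-empty.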
7300
7301
7302 // ======================================================
7303
7304
7305 class C_MDS_rename_finish : public ServerLogContext {
7306 CDentry *srcdn;
7307 CDentry *destdn;
7308 CDentry *straydn;
7309 public:
7310 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7311 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7312 ServerLogContext(s, r),
7313 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7314 void finish(int r) override {
7315 ceph_assert(r == 0);
7316 server->_rename_finish(mdr, srcdn, destdn, straydn);
7317 }
7318 };
7319
7320
7321 /** handle_client_rename
7322 *
7323 * rename master is the destdn auth. this is because cached inodes
7324 * must remain connected. thus, any replica of srci, must also
7325 * replicate destdn, and possibly straydn, so that srci (and
7326 * destdn->inode) remain connected during the rename.
7327 *
7328 * to do this, we freeze srci, then master (destdn auth) verifies that
7329 * all other nodes have also replicated destdn and straydn. note that
7330 * destdn replicas need not also replicate srci. this only works when
7331 * destdn is master.
7332 *
7333 * This function takes responsibility for the passed mdr.
7334 */
7335 void Server::handle_client_rename(MDRequestRef& mdr)
7336 {
7337 const MClientRequest::const_ref &req = mdr->client_request;
7338 dout(7) << "handle_client_rename " << *req << dendl;
7339
7340 filepath destpath = req->get_filepath();
7341 filepath srcpath = req->get_filepath2();
7342 if (destpath.depth() == 0 || srcpath.depth() == 0) {
7343 respond_to_request(mdr, -EINVAL);
7344 return;
7345 }
7346 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
7347 respond_to_request(mdr, -EBUSY);
7348 return;
7349 }
7350
7351 std::string_view destname = destpath.last_dentry();
7352
7353 vector<CDentry*>& srctrace = mdr->dn[1];
7354 vector<CDentry*>& desttrace = mdr->dn[0];
7355
7356 MutationImpl::LockOpVec lov;
7357
7358 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, lov, true, false, true);
7359 if (!destdn) return;
7360 dout(10) << " destdn " << *destdn << dendl;
7361 if (mdr->snapid != CEPH_NOSNAP) {
7362 respond_to_request(mdr, -EROFS);
7363 return;
7364 }
7365 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7366 CDir *destdir = destdn->get_dir();
7367 ceph_assert(destdir->is_auth());
7368
7369 CF_MDS_MDRContextFactory cf(mdcache, mdr);
7370 int r = mdcache->path_traverse(mdr, cf, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
7371 if (r > 0)
7372 return; // delayed
7373 if (r < 0) {
7374 if (r == -ESTALE) {
7375 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
7376 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
7377 } else {
7378 dout(10) << "FAIL on error " << r << dendl;
7379 respond_to_request(mdr, r);
7380 }
7381 return;
7382
7383 }
7384 ceph_assert(!srctrace.empty());
7385 CDentry *srcdn = srctrace.back();
7386 dout(10) << " srcdn " << *srcdn << dendl;
7387 if (srcdn->last != CEPH_NOSNAP) {
7388 respond_to_request(mdr, -EROFS);
7389 return;
7390 }
7391 CDir *srcdir = srcdn->get_dir();
7392 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7393 CInode *srci = srcdnl->get_inode();
7394 dout(10) << " srci " << *srci << dendl;
7395
7396 CInode *oldin = 0;
7397 if (!destdnl->is_null()) {
7398 //dout(10) << "dest dn exists " << *destdn << dendl;
7399 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
7400 if (!oldin) return;
7401 dout(10) << " oldin " << *oldin << dendl;
7402
7403 // non-empty dir? do a trivial fast unlocked check now; do another check later with read locks
7404 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
7405 respond_to_request(mdr, -ENOTEMPTY);
7406 return;
7407 }
7408
7409 // if srcdn is replica, need to make sure its linkage is correct
7410 if (srcdn->is_auth() ||
7411 srcdn->lock.can_read(mdr->get_client()) ||
7412 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
7413 // mv /some/thing /to/some/existing_other_thing
7414 if (oldin->is_dir() && !srci->is_dir()) {
7415 respond_to_request(mdr, -EISDIR);
7416 return;
7417 }
7418 if (!oldin->is_dir() && srci->is_dir()) {
7419 respond_to_request(mdr, -ENOTDIR);
7420 return;
7421 }
7422 if (srci == oldin && !srcdir->inode->is_stray()) {
7423 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
7424 return;
7425 }
7426 }
7427 }
7428
7429 // -- some sanity checks --
7430
7431 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7432 if (destpath.get_ino() != srcpath.get_ino() &&
7433 !(req->get_source().is_mds() &&
7434 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7435 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
7436 CInode *destbase = desttrace[0]->get_dir()->get_inode();
7437 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7438 while (srcbase != destbase &&
7439 !srcbase->is_projected_ancestor_of(destbase)) {
7440 CDentry *pdn = srcbase->get_projected_parent_dn();
7441 srctrace.insert(srctrace.begin(), pdn);
7442 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
7443 srcbase = pdn->get_dir()->get_inode();
7444 }
7445
7446 // then, extend destpath until it shares the same parent inode as srcpath.
7447 while (destbase != srcbase) {
7448 CDentry *pdn = destbase->get_projected_parent_dn();
7449 desttrace.insert(desttrace.begin(), pdn);
7450 lov.add_rdlock(&pdn->lock);
7451 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
7452 destbase = pdn->get_dir()->get_inode();
7453 }
7454 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
7455 }
7456
7457 // src == dest?
7458 if (srcdir == destdir && srcdn->get_name() == destname) {
7459 dout(7) << "rename src=dest, noop" << dendl;
7460 respond_to_request(mdr, 0);
7461 return;
7462 }
7463
7464 // dest a child of src?
7465 // e.g. mv /usr /usr/foo
7466 CDentry *pdn = destdir->inode->get_projected_parent_dn();
7467 while (pdn) {
7468 if (pdn == srcdn) {
7469 dout(7) << "cannot rename item to be a child of itself" << dendl;
7470 respond_to_request(mdr, -EINVAL);
7471 return;
7472 }
7473 pdn = pdn->get_dir()->inode->parent;
7474 }
7475
7476 // is this a stray migration, reintegration or merge? (sanity checks!)
7477 if (mdr->reqid.name.is_mds() &&
7478 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
7479 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
7480 !(destdnl->is_remote() &&
7481 destdnl->get_remote_ino() == srci->ino())) {
7482 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
7483 return;
7484 }
7485
7486 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7487 if (linkmerge)
7488 dout(10) << " this is a link merge" << dendl;
7489
7490 // -- create stray dentry? --
7491 CDentry *straydn = NULL;
7492 if (destdnl->is_primary() && !linkmerge) {
7493 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
7494 if (!straydn)
7495 return;
7496 dout(10) << " straydn is " << *straydn << dendl;
7497 } else if (mdr->straydn) {
7498 mdr->unpin(mdr->straydn);
7499 mdr->straydn = NULL;
7500 }
7501
7502 // -- prepare witness list --
7503 /*
7504 * NOTE: we use _all_ replicas as witnesses.
7505 * this probably isn't totally necessary (esp for file renames),
7506 * but if/when we change that, we have to make sure rejoin is
7507 * sufficiently robust to handle strong rejoins from survivors
7508 * with totally wrong dentry->inode linkage.
7509 * (currently, it can ignore rename effects, because the resolve
7510 * stage will sort them out.)
7511 */
7512 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
7513 if (srcdn->is_auth())
7514 srcdn->list_replicas(witnesses);
7515 else
7516 witnesses.insert(srcdn->authority().first);
7517 if (srcdnl->is_remote() && !srci->is_auth())
7518 witnesses.insert(srci->authority().first);
7519 destdn->list_replicas(witnesses);
7520 if (destdnl->is_remote() && !oldin->is_auth())
7521 witnesses.insert(oldin->authority().first);
7522 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7523
7524
7525 // -- locks --
7526
7527 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
7528 for (int i=0; i<(int)srctrace.size(); i++)
7529 lov.add_rdlock(&srctrace[i]->lock);
7530 lov.add_xlock(&srcdn->lock);
7531 mds_rank_t srcdirauth = srcdir->authority().first;
7532 if (srcdirauth != mds->get_nodeid()) {
7533 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
7534 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdirauth);
7535 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdirauth);
7536 if (srci->is_dir())
7537 lov.add_rdlock(&srci->dirfragtreelock);
7538 } else {
7539 lov.add_wrlock(&srcdir->inode->filelock);
7540 lov.add_wrlock(&srcdir->inode->nestlock);
7541 }
7542 mds->locker->include_snap_rdlocks(srcdir->inode, lov);
7543
7544 // straydn?
7545 if (straydn) {
7546 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7547 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7548 lov.add_xlock(&straydn->lock);
7549 }
7550
7551 // xlock versionlock on dentries if there are witnesses.
7552 // replicas can't see projected dentry linkages, and will get
7553 // confused if we try to pipeline things.
7554 if (!witnesses.empty()) {
7555 // take xlock on all projected ancestor dentries for srcdn and destdn.
7556 // this ensures the srcdn and destdn can be traversed to by the witnesses.
7557 for (int i= 0; i<(int)srctrace.size(); i++) {
7558 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
7559 lov.add_xlock(&srctrace[i]->versionlock);
7560 }
7561 for (int i=0; i<(int)desttrace.size(); i++) {
7562 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
7563 lov.add_xlock(&desttrace[i]->versionlock);
7564 }
7565 // xlock srci and oldin's primary dentries, so witnesses can call
7566 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
7567 // is traversed.
7568 if (srcdnl->is_remote())
7569 lov.add_xlock(&srci->get_projected_parent_dn()->lock);
7570 if (destdnl->is_remote())
7571 lov.add_xlock(&oldin->get_projected_parent_dn()->lock);
7572 }
7573
7574 // we need to update srci's ctime. xlock its least contended lock to do that...
7575 lov.add_xlock(&srci->linklock);
7576 lov.add_xlock(&srci->snaplock);
7577
7578 if (oldin) {
7579 // xlock oldin (for nlink--)
7580 lov.add_xlock(&oldin->linklock);
7581 lov.add_xlock(&oldin->snaplock);
7582 if (oldin->is_dir())
7583 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7584 }
7585
7586 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
7587 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
7588 return;
7589
7590 if (linkmerge)
7591 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
7592
7593 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7594 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7595 return;
7596
7597 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
7598 return;
7599
7600 if (!check_fragment_space(mdr, destdn->get_dir()))
7601 return;
7602
7603 if (!check_access(mdr, srci, MAY_WRITE))
7604 return;
7605 }
7606
7607 // with read lock, really verify oldin is empty
7608 if (oldin &&
7609 oldin->is_dir() &&
7610 _dir_is_nonempty(mdr, oldin)) {
7611 respond_to_request(mdr, -ENOTEMPTY);
7612 return;
7613 }
7614
7615 /* project_snaprealm_past_parent() will do this job
7616 *
7617 // moving between snaprealms?
7618 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7619 SnapRealm *srcrealm = srci->find_snaprealm();
7620 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7621 if (srcrealm != destrealm &&
7622 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7623 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7624 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7625 mdcache->snaprealm_create(mdr, srci);
7626 return;
7627 }
7628 }
7629 */
7630
7631 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7632
7633 // -- open all srcdn inode frags, if any --
7634 // we need these open so that auth can properly delegate from inode to dirfrags
7635 // after the inode is _ours_.
7636 if (srcdnl->is_primary() &&
7637 !srcdn->is_auth() &&
7638 srci->is_dir()) {
7639 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
7640 mdr->set_stickydirs(srci);
7641
7642 frag_vec_t leaves;
7643 srci->dirfragtree.get_leaves(leaves);
7644 for (const auto& leaf : leaves) {
7645 CDir *dir = srci->get_dirfrag(leaf);
7646 if (!dir) {
7647 dout(10) << " opening " << leaf << " under " << *srci << dendl;
7648 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7649 return;
7650 }
7651 }
7652 }
7653
7654 // -- prepare snaprealm ---
7655
7656 if (linkmerge) {
7657 if (!mdr->more()->srci_srnode &&
7658 srci->get_projected_inode()->nlink == 1 &&
7659 srci->is_projected_snaprealm_global()) {
7660 sr_t *new_srnode = srci->prepare_new_srnode(0);
7661 srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);
7662
7663 srci->clear_snaprealm_global(new_srnode);
7664 mdr->more()->srci_srnode = new_srnode;
7665 }
7666 } else {
7667 if (oldin && !mdr->more()->desti_srnode) {
7668 if (oldin->is_projected_snaprealm_global()) {
7669 sr_t *new_srnode = oldin->prepare_new_srnode(0);
7670 oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
7671 // dropping the last linkage or dropping the last remote linkage,
7672 // detach the inode from the global snaprealm
7673 auto nlink = oldin->get_projected_inode()->nlink;
7674 if (nlink == 1 ||
7675 (nlink == 2 && !destdnl->is_primary() &&
7676 !oldin->get_projected_parent_dir()->inode->is_stray()))
7677 oldin->clear_snaprealm_global(new_srnode);
7678 mdr->more()->desti_srnode = new_srnode;
7679 } else if (destdnl->is_primary()) {
7680 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7681 snapid_t follows = dest_realm->get_newest_seq();
7682 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
7683 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
7684 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7685 mdr->more()->desti_srnode = new_srnode;
7686 }
7687 }
7688 }
7689 if (!mdr->more()->srci_srnode) {
7690 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7691 if (srci->is_projected_snaprealm_global()) {
7692 sr_t *new_srnode = srci->prepare_new_srnode(0);
7693 srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
7694 mdr->more()->srci_srnode = new_srnode;
7695 } else if (srcdnl->is_primary()) {
7696 SnapRealm *src_realm = srcdir->inode->find_snaprealm();
7697 snapid_t follows = src_realm->get_newest_seq();
7698 if (src_realm != dest_realm &&
7699 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
7700 sr_t *new_srnode = srci->prepare_new_srnode(follows);
7701 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
7702 mdr->more()->srci_srnode = new_srnode;
7703 }
7704 }
7705 }
7706 }
7707
7708 // -- prepare witnesses --
7709
7710 // do srcdn auth last
7711 mds_rank_t last = MDS_RANK_NONE;
7712 if (!srcdn->is_auth()) {
7713 last = srcdn->authority().first;
7714 mdr->more()->srcdn_auth_mds = last;
7715 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
7716 // are involved in the rename operation.
7717 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
7718 dout(10) << " preparing ambiguous auth for srci" << dendl;
7719 ceph_assert(mdr->more()->is_remote_frozen_authpin);
7720 ceph_assert(mdr->more()->rename_inode == srci);
7721 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7722 return;
7723 }
7724 }
7725
7726 for (set<mds_rank_t>::iterator p = witnesses.begin();
7727 p != witnesses.end();
7728 ++p) {
7729 if (*p == last) continue; // do it last!
7730 if (mdr->more()->witnessed.count(*p)) {
7731 dout(10) << " already witnessed by mds." << *p << dendl;
7732 } else if (mdr->more()->waiting_on_slave.count(*p)) {
7733 dout(10) << " already waiting on witness mds." << *p << dendl;
7734 } else {
7735 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
7736 return;
7737 }
7738 }
7739 if (!mdr->more()->waiting_on_slave.empty())
7740 return; // we're waiting for a witness.
7741
7742 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
7743 dout(10) << " preparing last witness (srcdn auth)" << dendl;
7744 ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
7745 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7746 return;
7747 }
7748
7749 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
7750 if (!mdr->more()->slaves.empty() && !srci->is_dir())
7751 ceph_assert(g_conf()->mds_kill_rename_at != 3);
7752 if (!mdr->more()->slaves.empty() && srci->is_dir())
7753 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7754
7755 // -- declare now --
7756 mdr->set_mds_stamp(ceph_clock_now());
7757
7758 // -- prepare journal entry --
7759 mdr->ls = mdlog->get_current_segment();
7760 EUpdate *le = new EUpdate(mdlog, "rename");
7761 mdlog->start_entry(le);
7762 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7763 if (!mdr->more()->witnessed.empty()) {
7764 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7765
7766 le->reqid = mdr->reqid;
7767 le->had_slaves = true;
7768
7769 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7770 // no need to send frozen auth pin to the recovering auth MDS of srci
7771 mdr->more()->is_remote_frozen_authpin = false;
7772 }
7773
7774 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
7775 if (le->client_map.length())
7776 le->cmapv = mds->sessionmap.get_projected();
7777
7778 // -- commit locally --
7779 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
7780
7781 journal_and_reply(mdr, srci, destdn, le, fin);
7782 mds->balancer->maybe_fragment(destdn->get_dir(), false);
7783 }
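// Editorial note, not part of the original source: handle_client_rename()
// above is the master-side state machine. Each early 'return' is a
// park-and-retry point, not a failure: path traversal, lock acquisition,
// dirfrag opens and witness prepares all re-queue the MDRequest and
// re-enter the function from the top. Only when every precondition holds
// in a single pass does it journal the EUpdate and reply. A condensed
// view of the phases:
//
//   1. resolve destdn/srcdn, sanity checks (-EINVAL, -ENOTEMPTY, ...)
//   2. build the witness set, take all locks (possibly remote wrlocks)
//   3. prepare snaprealm updates for srci/oldin
//   4. prepare witnesses (slaves), srcdn auth last
//   5. _rename_prepare() -> journal_and_reply()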
7784
7785
7786 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7787 {
7788 dout(10) << "_rename_finish " << *mdr << dendl;
7789
7790 if (!mdr->more()->witnessed.empty())
7791 mdcache->logged_master_update(mdr->reqid);
7792
7793 // apply
7794 _rename_apply(mdr, srcdn, destdn, straydn);
7795
7796 mdcache->send_dentry_link(destdn, mdr);
7797
7798 CDentry::linkage_t *destdnl = destdn->get_linkage();
7799 CInode *in = destdnl->get_inode();
7800 bool need_eval = mdr->more()->cap_imports.count(in);
7801
7802 // test hack: test slave commit
7803 if (!mdr->more()->slaves.empty() && !in->is_dir())
7804 ceph_assert(g_conf()->mds_kill_rename_at != 5);
7805 if (!mdr->more()->slaves.empty() && in->is_dir())
7806 ceph_assert(g_conf()->mds_kill_rename_at != 6);
7807
7808 // bump popularity
7809 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7810 if (destdnl->is_remote() && in->is_auth())
7811 mds->balancer->hit_inode(in, META_POP_IWR);
7812
7813 // did we import srci? if so, explicitly ack the import before we unlock and reply.
7814
7815 ceph_assert(g_conf()->mds_kill_rename_at != 7);
7816
7817 // reply
7818 respond_to_request(mdr, 0);
7819
7820 if (need_eval)
7821 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
7822
7823 // clean up?
7824 // respond_to_request() drops locks. So stray reintegration can race with us.
7825 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7826 mdcache->notify_stray(straydn);
7827 }
7828 }
7829
7830
7831
7832 // helpers
7833
7834 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
7835 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
7836 {
7837 if (mds->is_cluster_degraded() &&
7838 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7839 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
7840 if (mdr->more()->waiting_on_slave.empty())
7841 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7842 return false;
7843 }
7844
7845 dout(10) << "_rename_prepare_witness mds." << who << dendl;
7846 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
7847
7848 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
7849 for (auto dn : srctrace)
7850 req->srcdnpath.push_dentry(dn->get_name());
7851 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
7852 for (auto dn : dsttrace)
7853 req->destdnpath.push_dentry(dn->get_name());
7854 if (straydn)
7855 mdcache->replicate_stray(straydn, who, req->straybl);
7856
7857 if (mdr->more()->srci_srnode)
7858 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
7859 if (mdr->more()->desti_srnode)
7860 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7861
7862 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7863
7864 // srcdn auth will verify our current witness list is sufficient
7865 req->witnesses = witnesses;
7866
7867 req->op_stamp = mdr->get_op_stamp();
7868 mds->send_message_mds(req, who);
7869
7870 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
7871 mdr->more()->waiting_on_slave.insert(who);
7872 return true;
7873 }
7874
7875 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
7876 {
7877 version_t oldpv = mdr->more()->inode_import_v;
7878
7879 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7880
7881 /* import node */
7882 auto blp = mdr->more()->inode_import.cbegin();
7883
7884 // imported caps
7885 map<client_t,entity_inst_t> client_map;
7886 map<client_t, client_metadata_t> client_metadata_map;
7887 decode(client_map, blp);
7888 decode(client_metadata_map, blp);
7889 prepare_force_open_sessions(client_map, client_metadata_map,
7890 mdr->more()->imported_session_map);
7891 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
7892 encode(client_metadata_map, *client_map_bl);
7893
7894 list<ScatterLock*> updated_scatterlocks;
7895 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
7896 mdr->more()->cap_imports, updated_scatterlocks);
7897
7898 // hack: force back to !auth and clean, temporarily
7899 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
7900 srcdnl->get_inode()->mark_clean();
7901
7902 return oldpv;
7903 }
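// Editorial note, not part of the original source: the import blob
// decoded above was packed by the srcdn auth in the same order it is
// read here -- client map, client metadata map, then the inode via
// decode_import_inode() -- so the two sides must stay in lockstep. The
// temporary "!auth and clean" hack is expected to be undone when the
// rename is applied and the inode's authority actually transfers.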
7904
7905 bool Server::_need_force_journal(CInode *diri, bool empty)
7906 {
7907 std::vector<CDir*> dirs;
7908 diri->get_dirfrags(dirs);
7909
7910 bool force_journal = false;
7911 if (empty) {
7912 for (const auto& dir : dirs) {
7913 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
7914 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7915 force_journal = true;
7916 break;
7917 } else
7918 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
7919 }
7920 } else {
7921 // see if any children of our frags are auth subtrees.
7922 std::vector<CDir*> subtrees;
7923 mdcache->get_subtrees(subtrees);
7924 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
7925 for (const auto& dir : dirs) {
7926 for (const auto& subtree : subtrees) {
7927 if (dir->contains(subtree)) {
7928 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
7929 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
7930 << *subtree << dendl;
7931 force_journal = true;
7932 break;
7933 } else
7934 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
7935 } else
7936 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
7937 }
7938 if (force_journal)
7939 break;
7940 }
7941 }
7942 return force_journal;
7943 }
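// Editorial note, not part of the original source: _need_force_journal()
// answers one question -- will replaying this rename on this rank need
// the directory inode in the journal to anchor auth subtrees? In the
// 'empty' (rename-target) case it suffices that any dirfrag of the inode
// is itself an auth subtree root; otherwise any auth subtree nested
// below one of the inode's dirfrags forces journaling.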
7944
7945 void Server::_rename_prepare(MDRequestRef& mdr,
7946 EMetaBlob *metablob, bufferlist *client_map_bl,
7947 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7948 {
7949 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7950 if (straydn)
7951 dout(10) << " straydn " << *straydn << dendl;
7952
7953 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7954 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7955 CInode *srci = srcdnl->get_inode();
7956 CInode *oldin = destdnl->get_inode();
7957
7958 // primary+remote link merge?
7959 bool linkmerge = (srci == oldin);
7960 if (linkmerge)
7961 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7962 bool silent = srcdn->get_dir()->inode->is_stray();
7963
7964 bool force_journal_dest = false;
7965 if (srci->is_dir() && !destdn->is_auth()) {
7966 if (srci->is_auth()) {
7967 // if we are auth for srci and exporting it, force journal because journal replay needs
7968 // the source inode to create auth subtrees.
7969 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7970 force_journal_dest = true;
7971 } else
7972 force_journal_dest = _need_force_journal(srci, false);
7973 }
7974
7975 bool force_journal_stray = false;
7976 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7977 force_journal_stray = _need_force_journal(oldin, true);
7978
7979 if (linkmerge)
7980 dout(10) << " merging remote and primary links to the same inode" << dendl;
7981 if (silent)
7982 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7983 if (force_journal_dest)
7984 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7985 if (force_journal_stray)
7986 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7987
7988 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7989 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7990 metablob->renamed_dirino = srci->ino();
7991 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7992 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7993 metablob->renamed_dirino = oldin->ino();
7994 }
7995
7996 // prepare
7997 CInode::mempool_inode *spi = 0; // renamed inode
7998 CInode::mempool_inode *tpi = 0; // target/overwritten inode
7999
8000 // target inode
8001 if (!linkmerge) {
8002 if (destdnl->is_primary()) {
8003 ceph_assert(straydn); // moving to straydn.
8004 // link--, and move.
8005 if (destdn->is_auth()) {
8006 auto &pi= oldin->project_inode(); //project_snaprealm
8007 pi.inode.version = straydn->pre_dirty(pi.inode.version);
8008 pi.inode.update_backtrace();
8009 tpi = &pi.inode;
8010 }
8011 straydn->push_projected_linkage(oldin);
8012 } else if (destdnl->is_remote()) {
8013 // nlink-- targeti
8014 if (oldin->is_auth()) {
8015 auto &pi = oldin->project_inode();
8016 pi.inode.version = oldin->pre_dirty();
8017 tpi = &pi.inode;
8018 }
8019 }
8020 }
8021
8022 // dest
8023 if (srcdnl->is_remote()) {
8024 if (!linkmerge) {
8025 // destdn
8026 if (destdn->is_auth())
8027 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8028 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8029 // srci
8030 if (srci->is_auth()) {
8031 auto &pi = srci->project_inode();
8032 pi.inode.version = srci->pre_dirty();
8033 spi = &pi.inode;
8034 }
8035 } else {
8036 dout(10) << " will merge remote onto primary link" << dendl;
8037 if (destdn->is_auth()) {
8038 auto &pi = oldin->project_inode();
8039 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
8040 spi = &pi.inode;
8041 }
8042 }
8043 } else { // primary
8044 if (destdn->is_auth()) {
8045 version_t oldpv;
8046 if (srcdn->is_auth())
8047 oldpv = srci->get_projected_version();
8048 else {
8049 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8050
8051 // note which dirfrags have child subtrees in the journal
8052 // event, so that we can open those (as bounds) during replay.
8053 if (srci->is_dir()) {
8054 list<CDir*> ls;
8055 srci->get_dirfrags(ls);
8056 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8057 CDir *dir = *p;
8058 if (!dir->is_auth())
8059 metablob->renamed_dir_frags.push_back(dir->get_frag());
8060 }
8061 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8062 }
8063 }
8064 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
8065 // & srcdnl->snaprealm
8066 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8067 pi.inode.update_backtrace();
8068 spi = &pi.inode;
8069 }
8070 destdn->push_projected_linkage(srci);
8071 }
8072
8073 // src
8074 if (srcdn->is_auth())
8075 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8076 srcdn->push_projected_linkage(); // push null linkage
8077
8078 if (!silent) {
8079 if (spi) {
8080 spi->ctime = mdr->get_op_stamp();
8081 if (mdr->get_op_stamp() > spi->rstat.rctime)
8082 spi->rstat.rctime = mdr->get_op_stamp();
8083 spi->change_attr++;
8084 if (linkmerge)
8085 spi->nlink--;
8086 }
8087 if (tpi) {
8088 tpi->ctime = mdr->get_op_stamp();
8089 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8090 tpi->rstat.rctime = mdr->get_op_stamp();
8091 tpi->change_attr++;
8092 {
8093 std::string t;
8094 destdn->make_path_string(t, true);
8095 tpi->stray_prior_path = std::move(t);
8096 }
8097 tpi->nlink--;
8098 if (tpi->nlink == 0)
8099 oldin->state_set(CInode::STATE_ORPHAN);
8100 }
8101 }
8102
8103 // prepare nesting, mtime updates
8104 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8105
8106 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8107 // then link the source inode to destdn
8108 if (destdnl->is_primary()) {
8109 ceph_assert(straydn);
8110 if (straydn->is_auth()) {
8111 metablob->add_dir_context(straydn->get_dir());
8112 metablob->add_dir(straydn->get_dir(), true);
8113 }
8114 }
8115
8116 // sub off target
8117 if (destdn->is_auth() && !destdnl->is_null()) {
8118 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8119 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8120 if (destdnl->is_primary()) {
8121 ceph_assert(straydn);
8122 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8123 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8124 }
8125 }
8126
8127 // move srcdn
8128 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8129 int flags = predirty_dir | predirty_primary;
8130 if (srcdn->is_auth())
8131 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8132 if (destdn->is_auth())
8133 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8134
8135 // add it all to the metablob
8136 // target inode
8137 if (!linkmerge) {
8138 if (destdnl->is_primary()) {
8139 ceph_assert(straydn);
8140 if (destdn->is_auth()) {
8141 // project snaprealm, too
8142 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8143 oldin->project_snaprealm(desti_srnode);
8144 if (tpi->nlink == 0)
8145 ceph_assert(!desti_srnode->is_parent_global());
8146 desti_srnode = NULL;
8147 }
8148 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8149 metablob->add_primary_dentry(straydn, oldin, true, true);
8150 } else if (force_journal_stray) {
8151 dout(10) << " forced journaling straydn " << *straydn << dendl;
8152 metablob->add_dir_context(straydn->get_dir());
8153 metablob->add_primary_dentry(straydn, oldin, true);
8154 }
8155 } else if (destdnl->is_remote()) {
8156 if (oldin->is_auth()) {
8157 sr_t *new_srnode = NULL;
8158 if (mdr->slave_request) {
8159 if (mdr->slave_request->desti_snapbl.length() > 0) {
8160 new_srnode = new sr_t();
8161 auto p = mdr->slave_request->desti_snapbl.cbegin();
8162 decode(*new_srnode, p);
8163 }
8164 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8165 new_srnode = desti_srnode;
8166 desti_srnode = NULL;
8167 }
8168 if (new_srnode) {
8169 oldin->project_snaprealm(new_srnode);
8170 if (tpi->nlink == 0)
8171 ceph_assert(!new_srnode->is_parent_global());
8172 }
8173 // auth for targeti
8174 metablob->add_dir_context(oldin->get_projected_parent_dir());
8175 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
8176 CEPH_NOSNAP, 0, destdnl);
8177 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
8178 }
8179 }
8180 }
8181
8182 // dest
8183 if (srcdnl->is_remote()) {
8184 ceph_assert(!linkmerge);
8185 if (destdn->is_auth() && !destdnl->is_null())
8186 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8187 else
8188 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8189
8190 if (destdn->is_auth())
8191 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8192
8193 if (srci->is_auth() ) { // it's remote
8194 if (mdr->slave_request) {
8195 if (mdr->slave_request->srci_snapbl.length() > 0) {
8196 sr_t *new_srnode = new sr_t();
8197 auto p = mdr->slave_request->srci_snapbl.cbegin();
8198 decode(*new_srnode, p);
8199 srci->project_snaprealm(new_srnode);
8200 }
8201 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8202 srci->project_snaprealm(srci_srnode);
8203 srci_srnode = NULL;
8204 }
8205
8206 CDentry *srci_pdn = srci->get_projected_parent_dn();
8207 metablob->add_dir_context(srci_pdn->get_dir());
8208 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
8209 metablob->add_primary_dentry(srci_pdn, srci, true);
8210 }
8211 } else if (srcdnl->is_primary()) {
8212 // project snap parent update?
8213 if (destdn->is_auth()) {
8214 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8215 srci->project_snaprealm(srci_srnode);
8216 srci_srnode = NULL;
8217 }
8218 }
8219
8220 if (destdn->is_auth() && !destdnl->is_null())
8221 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8222
8223 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8224
8225 if (destdn->is_auth())
8226 metablob->add_primary_dentry(destdn, srci, true, true);
8227 else if (force_journal_dest) {
8228 dout(10) << " forced journaling destdn " << *destdn << dendl;
8229 metablob->add_dir_context(destdn->get_dir());
8230 metablob->add_primary_dentry(destdn, srci, true);
8231 if (srcdn->is_auth() && srci->is_dir()) {
8232 // journal new subtrees root dirfrags
8233 list<CDir*> ls;
8234 srci->get_dirfrags(ls);
8235 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8236 CDir *dir = *p;
8237 if (dir->is_auth())
8238 metablob->add_dir(dir, true);
8239 }
8240 }
8241 }
8242 }
8243
8244 // src
8245 if (srcdn->is_auth()) {
8246 dout(10) << " journaling srcdn " << *srcdn << dendl;
8247 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8248 // also journal the inode in case we need to do a slave rename rollback. It is OK to add
8249 // both primary and null dentries, because during journal replay the null dentry is
8250 // processed after the primary dentry.
8251 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8252 metablob->add_primary_dentry(srcdn, srci, true);
8253 metablob->add_null_dentry(srcdn, true);
8254 } else
8255 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8256
8257 // make renamed inode first track the dn
8258 if (srcdnl->is_primary() && destdn->is_auth()) {
8259 ceph_assert(srci->first <= destdn->first);
8260 srci->first = destdn->first;
8261 }
8262 // make stray inode first track the straydn
8263 if (straydn && straydn->is_auth()) {
8264 ceph_assert(oldin->first <= straydn->first);
8265 oldin->first = straydn->first;
8266 }
8267
8268 if (oldin && oldin->is_dir()) {
8269 ceph_assert(straydn);
8270 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8271 }
8272 if (srci->is_dir())
8273 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8274
8275 }
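// Editorial note, not part of the original source: the metablob built by
// _rename_prepare() is deliberately ordered for safe replay -- stray dir
// first, then the target-inode decrement, then the destdn link, then the
// srcdn null. Because replay processes a null dentry after a primary
// dentry for the same inode, journaling both (see the srcdn-auth branch
// above) is harmless.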
8276
8277
8278 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8279 {
8280 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8281 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8282
8283 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8284 CDentry::linkage_t *destdnl = destdn->get_linkage();
8285
8286 CInode *oldin = destdnl->get_inode();
8287
8288 // primary+remote link merge?
8289 bool linkmerge = (srcdnl->get_inode() == oldin);
8290 if (linkmerge)
8291 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8292
8293 bool new_in_snaprealm = false;
8294 bool new_oldin_snaprealm = false;
8295
8296 // target inode
8297 if (!linkmerge) {
8298 if (destdnl->is_primary()) {
8299 ceph_assert(straydn);
8300 dout(10) << "straydn is " << *straydn << dendl;
8301
8302 // if there is a newly created snaprealm, we need to split the old snaprealm's
8303 // inodes_with_caps, so pop the snaprealm before the linkage changes.
8304 if (destdn->is_auth()) {
8305 bool hadrealm = (oldin->snaprealm ? true : false);
8306 oldin->early_pop_projected_snaprealm();
8307 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8308 } else {
8309 ceph_assert(mdr->slave_request);
8310 if (mdr->slave_request->desti_snapbl.length()) {
8311 new_oldin_snaprealm = !oldin->snaprealm;
8312 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8313 ceph_assert(oldin->snaprealm);
8314 ceph_assert(oldin->snaprealm->have_past_parents_open());
8315 }
8316 }
8317
8318 destdn->get_dir()->unlink_inode(destdn, false);
8319
8320 straydn->pop_projected_linkage();
8321 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8322 ceph_assert(!straydn->is_projected()); // no other projected
8323
8324 // nlink-- targeti
8325 if (destdn->is_auth())
8326 oldin->pop_and_dirty_projected_inode(mdr->ls);
8327
8328 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
8329 } else if (destdnl->is_remote()) {
8330 destdn->get_dir()->unlink_inode(destdn, false);
8331 if (oldin->is_auth()) {
8332 oldin->pop_and_dirty_projected_inode(mdr->ls);
8333 } else if (mdr->slave_request) {
8334 if (mdr->slave_request->desti_snapbl.length() > 0) {
8335 ceph_assert(oldin->snaprealm);
8336 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8337 }
8338 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8339 delete desti_srnode;
8340 desti_srnode = NULL;
8341 }
8342 }
8343 }
8344
8345 // unlink src before we relink it at dest
8346 CInode *in = srcdnl->get_inode();
8347 ceph_assert(in);
8348
8349 bool srcdn_was_remote = srcdnl->is_remote();
8350 if (!srcdn_was_remote) {
8351 // if there is a newly created snaprealm, we need to split the old snaprealm's
8352 // inodes_with_caps, so pop the snaprealm before the linkage changes.
8353 if (destdn->is_auth()) {
8354 bool hadrealm = (in->snaprealm ? true : false);
8355 in->early_pop_projected_snaprealm();
8356 new_in_snaprealm = (in->snaprealm && !hadrealm);
8357 } else {
8358 ceph_assert(mdr->slave_request);
8359 if (mdr->slave_request->srci_snapbl.length()) {
8360 new_in_snaprealm = !in->snaprealm;
8361 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8362 ceph_assert(in->snaprealm);
8363 ceph_assert(in->snaprealm->have_past_parents_open());
8364 }
8365 }
8366 }
8367
8368 srcdn->get_dir()->unlink_inode(srcdn);
8369
8370 // dest
8371 if (srcdn_was_remote) {
8372 if (!linkmerge) {
8373 // destdn
8374 destdnl = destdn->pop_projected_linkage();
8375 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8376 ceph_assert(!destdn->is_projected()); // no other projected
8377
8378 destdn->link_remote(destdnl, in);
8379 if (destdn->is_auth())
8380 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8381 // in
8382 if (in->is_auth()) {
8383 in->pop_and_dirty_projected_inode(mdr->ls);
8384 } else if (mdr->slave_request) {
8385 if (mdr->slave_request->srci_snapbl.length() > 0) {
8386 ceph_assert(in->snaprealm);
8387 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8388 }
8389 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8390 delete srci_srnode;
8391 srci_srnode = NULL;
8392 }
8393 } else {
8394 dout(10) << "merging remote onto primary link" << dendl;
8395 oldin->pop_and_dirty_projected_inode(mdr->ls);
8396 }
8397 } else { // primary
8398 if (linkmerge) {
8399 dout(10) << "merging primary onto remote link" << dendl;
8400 destdn->get_dir()->unlink_inode(destdn, false);
8401 }
8402 destdnl = destdn->pop_projected_linkage();
8403 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8404 ceph_assert(!destdn->is_projected()); // no other projected
8405
8406 // srcdn inode import?
8407 if (!srcdn->is_auth() && destdn->is_auth()) {
8408 ceph_assert(mdr->more()->inode_import.length() > 0);
8409
8410 map<client_t,Capability::Import> imported_caps;
8411
8412 // finish cap imports
8413 finish_force_open_sessions(mdr->more()->imported_session_map);
8414 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
8415 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
8416 mdr->more()->srcdn_auth_mds, true,
8417 mdr->more()->imported_session_map,
8418 mdr->more()->cap_imports[destdnl->get_inode()],
8419 imported_caps);
8420 }
8421
8422 mdr->more()->inode_import.clear();
8423 encode(imported_caps, mdr->more()->inode_import);
8424
8425 /* hack: add an auth pin for each xlock we hold. These were
8426 * remote xlocks previously, but now they're local, and
8427 * we're going to try to unpin them when we xlock_finish. */
8428
8429 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
8430 i != mdr->locks.end();
8431 ++i) {
8432 SimpleLock *lock = i->lock;
8433 if (lock->get_parent() != destdnl->get_inode())
8434 break;
8435 if (i->is_xlock() && !lock->is_locallock())
8436 mds->locker->xlock_import(lock);
8437 }
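// The export-side counterpart is in _commit_slave_rename(), which walks
// the same lock range and calls mds->locker->xlock_export() on the xlocks
// that migrated away with the inode.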
8438
8439 // hack: fix auth bit
8440 in->state_set(CInode::STATE_AUTH);
8441
8442 mdr->clear_ambiguous_auth();
8443 }
8444
8445 if (destdn->is_auth())
8446 in->pop_and_dirty_projected_inode(mdr->ls);
8447 }
8448
8449 // src
8450 if (srcdn->is_auth())
8451 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
8452 srcdn->pop_projected_linkage();
8453 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8454 ceph_assert(!srcdn->is_projected()); // no other projected
8455
8456 // apply remaining projected inodes (nested)
8457 mdr->apply();
8458
8459 // update subtree map?
8460 if (destdnl->is_primary() && in->is_dir())
8461 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
8462
8463 if (straydn && oldin->is_dir())
8464 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
8465
8466 if (new_oldin_snaprealm)
8467 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
8468 if (new_in_snaprealm)
8469 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
8470
8471 // removing a new dn?
8472 if (srcdn->is_auth())
8473 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
8474 }
8475
8476
8477
8478 // ------------
8479 // SLAVE
8480
8481 class C_MDS_SlaveRenamePrep : public ServerLogContext {
8482 CDentry *srcdn, *destdn, *straydn;
8483 public:
8484 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8485 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
8486 void finish(int r) override {
8487 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
8488 }
8489 };
8490
8491 class C_MDS_SlaveRenameCommit : public ServerContext {
8492 MDRequestRef mdr;
8493 CDentry *srcdn, *destdn, *straydn;
8494 public:
8495 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8496 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
8497 void finish(int r) override {
8498 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
8499 }
8500 };
8501
8502 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
8503 MDRequestRef mdr;
8504 public:
8505 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
8506 ServerContext(s), mdr(r) {}
8507 void finish(int r) override {
8508 server->_slave_rename_sessions_flushed(mdr);
8509 }
8510 };
8511
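// A rough sketch of the slave-side prep flow implemented below:
//   1. traverse to destdn and srcdn (possibly discovering replicas);
//   2. if we are srcdn's auth: freeze the source inode and mark it
//      ambiguous auth, notify bystander replicas (OP_RENAMENOTIFY), flush
//      client sessions, and bounce the request back if the master's
//      witness list doesn't cover all srcdn replicas;
//   3. record the original src/dest/stray state in a rename_rollback blob;
//   4. journal an ESlaveUpdate::OP_PREPARE and ack with OP_RENAMEPREPACK.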
8512 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
8513 {
8514 dout(10) << "handle_slave_rename_prep " << *mdr
8515 << " " << mdr->slave_request->srcdnpath
8516 << " to " << mdr->slave_request->destdnpath
8517 << dendl;
8518
8519 if (mdr->slave_request->is_interrupted()) {
8520 dout(10) << " slave request interrupted, sending noop reply" << dendl;
8521 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8522 reply->mark_interrupted();
8523 mds->send_message_mds(reply, mdr->slave_to_mds);
8524 mdr->reset_slave_request();
8525 return;
8526 }
8527
8528 // discover destdn
8529 filepath destpath(mdr->slave_request->destdnpath);
8530 dout(10) << " dest " << destpath << dendl;
8531 vector<CDentry*> trace;
8532 CF_MDS_MDRContextFactory cf(mdcache, mdr);
8533 int r = mdcache->path_traverse(mdr, cf, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
8534 if (r > 0) return;
8535 if (r == -ESTALE) {
8536 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8537 mdr->slave_to_mds);
8538 return;
8539 }
8540 ceph_assert(r == 0); // we shouldn't get an error here!
8541
8542 CDentry *destdn = trace.back();
8543 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8544 dout(10) << " destdn " << *destdn << dendl;
8545 mdr->pin(destdn);
8546
8547 // discover srcdn
8548 filepath srcpath(mdr->slave_request->srcdnpath);
8549 dout(10) << " src " << srcpath << dendl;
8550 CInode *srci = nullptr;
8551 r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
8552 if (r > 0) return;
8553 ceph_assert(r == 0);
8554
8555 // srcpath must not point to a null dentry
8556 ceph_assert(srci != nullptr);
8557
8558 CDentry *srcdn = trace.back();
8559 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8560 dout(10) << " srcdn " << *srcdn << dendl;
8561 mdr->pin(srcdn);
8562 mdr->pin(srci);
8563
8564 // stray?
8565 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8566 if (linkmerge)
8567 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8568 CDentry *straydn = mdr->straydn;
8569 if (destdnl->is_primary() && !linkmerge)
8570 ceph_assert(straydn);
8571
8572 mdr->set_op_stamp(mdr->slave_request->op_stamp);
8573 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
8574
8575 // set up commit waiter (early, to clean up any freezing etc we do)
8576 if (!mdr->more()->slave_commit)
8577 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
8578
8579 // am i srcdn auth?
8580 if (srcdn->is_auth()) {
8581 set<mds_rank_t> srcdnrep;
8582 srcdn->list_replicas(srcdnrep);
8583
8584 bool reply_witness = false;
8585 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
8586 // freeze?
8587 // we need this to
8588 // - avoid conflicting lock state changes
8589 // - avoid concurrent updates to the inode
8590 // (this could also be accomplished with the versionlock)
8591 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8592 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
8593 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
8594
8595 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8596 if (srcdnl->get_inode()->is_frozen_auth_pin())
8597 mdr->unfreeze_auth_pin();
8598
8599 if (!frozen_inode) {
8600 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
8601 return;
8602 }
8603
8604 /*
8605 * set ambiguous auth for srci
8606 * NOTE: we don't worry about ambiguous cache expire as we do
8607 * with subtree migrations because all slaves will pin
8608 * srcdn->get_inode() for the duration of this rename.
8609 */
8610 mdr->set_ambiguous_auth(srcdnl->get_inode());
8611
8612 // just mark the source inode as ambiguous auth if more than two MDSs are involved;
8613 // the master will send another OP_RENAMEPREP slave request later.
8614 if (mdr->slave_request->witnesses.size() > 1) {
8615 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
8616 reply_witness = true;
8617 }
8618
8619 // make sure bystanders have received all lock related messages
8620 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8621 if (*p == mdr->slave_to_mds ||
8622 (mds->is_cluster_degraded() &&
8623 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
8624 continue;
8625 auto notify = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
8626 mds->send_message_mds(notify, *p);
8627 mdr->more()->waiting_on_slave.insert(*p);
8628 }
8629
8630 // make sure clients have received all cap related messages
8631 set<client_t> export_client_set;
8632 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
8633
8634 MDSGatherBuilder gather(g_ceph_context);
8635 flush_client_sessions(export_client_set, gather);
8636 if (gather.has_subs()) {
8637 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
8638 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
8639 gather.activate();
8640 }
8641 }
8642
8643 // is witness list sufficient?
8644 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8645 if (*p == mdr->slave_to_mds ||
8646 mdr->slave_request->witnesses.count(*p)) continue;
8647 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
8648 reply_witness = true;
8649 break;
8650 }
8651
8652 if (reply_witness) {
8653 ceph_assert(!srcdnrep.empty());
8654 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8655 reply->witnesses.swap(srcdnrep);
8656 mds->send_message_mds(reply, mdr->slave_to_mds);
8657 mdr->reset_slave_request();
8658 return;
8659 }
8660 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
8661 if (!mdr->more()->waiting_on_slave.empty()) {
8662 dout(10) << " still waiting for rename notify acks from "
8663 << mdr->more()->waiting_on_slave << dendl;
8664 return;
8665 }
8666 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
8667 // set ambiguous auth for srci on witnesses
8668 mdr->set_ambiguous_auth(srcdnl->get_inode());
8669 }
8670
8671 // encode everything we'd need to roll this back... basically, just the original state.
8672 rename_rollback rollback;
8673
8674 rollback.reqid = mdr->reqid;
8675
8676 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
8677 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8678 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
8679 rollback.orig_src.dname = srcdn->get_name();
8680 if (srcdnl->is_primary())
8681 rollback.orig_src.ino = srcdnl->get_inode()->ino();
8682 else {
8683 ceph_assert(srcdnl->is_remote());
8684 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
8685 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
8686 }
8687
8688 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
8689 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8690 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
8691 rollback.orig_dest.dname = destdn->get_name();
8692 if (destdnl->is_primary())
8693 rollback.orig_dest.ino = destdnl->get_inode()->ino();
8694 else if (destdnl->is_remote()) {
8695 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
8696 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
8697 }
8698
8699 if (straydn) {
8700 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
8701 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
8702 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
8703 rollback.stray.dname = straydn->get_name();
8704 }
8705 if (mdr->slave_request->desti_snapbl.length()) {
8706 CInode *oldin = destdnl->get_inode();
8707 if (oldin->snaprealm) {
8708 encode(true, rollback.desti_snapbl);
8709 oldin->encode_snap_blob(rollback.desti_snapbl);
8710 } else {
8711 encode(false, rollback.desti_snapbl);
8712 }
8713 }
8714 if (mdr->slave_request->srci_snapbl.length()) {
8715 if (srci->snaprealm) {
8716 encode(true, rollback.srci_snapbl);
8717 srci->encode_snap_blob(rollback.srci_snapbl);
8718 } else {
8719 encode(false, rollback.srci_snapbl);
8720 }
8721 }
8722 encode(rollback, mdr->more()->rollback_bl);
8723 // FIXME: rollback snaprealm
8724 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
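// This blob is journaled with the prepare event and decoded again only if
// the update must be rolled back; an illustrative round trip, mirroring
// do_rename_rollback() below:
//   rename_rollback rb;
//   auto q = mdr->more()->rollback_bl.cbegin();
//   decode(rb, q); // yields reqid plus the orig_src/orig_dest/stray records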
8725
8726 // journal.
8727 mdr->ls = mdlog->get_current_segment();
8728 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
8729 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
8730 mdlog->start_entry(le);
8731 le->rollback = mdr->more()->rollback_bl;
8732
8733 bufferlist blah; // inode import data... obviously not used if we're the slave
8734 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
8735
8736 if (le->commit.empty()) {
8737 dout(10) << " empty metablob, skipping journal" << dendl;
8738 mdlog->cancel_entry(le);
8739 mdr->ls = NULL;
8740 _logged_slave_rename(mdr, srcdn, destdn, straydn);
8741 } else {
8742 mdr->more()->slave_update_journaled = true;
8743 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
8744 mdr, __func__);
8745 mdlog->flush();
8746 }
8747 }
8748
8749 void Server::_logged_slave_rename(MDRequestRef& mdr,
8750 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8751 {
8752 dout(10) << "_logged_slave_rename " << *mdr << dendl;
8753
8754 // prepare ack
8755 MMDSSlaveRequest::ref reply;
8756 if (!mdr->aborted) {
8757 reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8758 if (!mdr->more()->slave_update_journaled)
8759 reply->mark_not_journaled();
8760 }
8761
8762 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8763 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
8764
8765 // export srci?
8766 if (srcdn->is_auth() && srcdnl->is_primary()) {
8767 // set export bounds for CInode::encode_export()
8768 if (reply) {
8769 list<CDir*> bounds;
8770 if (srcdnl->get_inode()->is_dir()) {
8771 srcdnl->get_inode()->get_dirfrags(bounds);
8772 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
8773 (*p)->state_set(CDir::STATE_EXPORTBOUND);
8774 }
8775
8776 map<client_t,entity_inst_t> exported_client_map;
8777 map<client_t, client_metadata_t> exported_client_metadata_map;
8778 bufferlist inodebl;
8779 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
8780 exported_client_map,
8781 exported_client_metadata_map);
8782
8783 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
8784 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
8785
8786 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
8787 encode(exported_client_metadata_map, reply->inode_export);
8788 reply->inode_export.claim_append(inodebl);
8789 reply->inode_export_v = srcdnl->get_inode()->inode.version;
8790 }
8791
8792 // remove mdr auth pin
8793 mdr->auth_unpin(srcdnl->get_inode());
8794 mdr->more()->is_inode_exporter = true;
8795
8796 if (srcdnl->get_inode()->is_dirty())
8797 srcdnl->get_inode()->mark_clean();
8798
8799 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
8800 }
8801
8802 // apply
8803 _rename_apply(mdr, srcdn, destdn, straydn);
8804
8805 CDentry::linkage_t *destdnl = destdn->get_linkage();
8806
8807 // bump popularity
8808 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8809 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
8810 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
8811
8812 // done.
8813 mdr->reset_slave_request();
8814 mdr->straydn = 0;
8815
8816 if (reply) {
8817 mds->send_message_mds(reply, mdr->slave_to_mds);
8818 } else {
8819 ceph_assert(mdr->aborted);
8820 dout(10) << " abort flag set, finishing" << dendl;
8821 mdcache->request_finish(mdr);
8822 }
8823 }
8824
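// Commit/abort handler invoked once the master resolves the rename. A rough
// map of the two paths below:
//   r == 0 (commit): hand exported xlocks over, finish the inode export,
//     unfreeze and clear ambiguous auth, then journal
//     ESlaveUpdate::OP_COMMIT (or call _committed_slave() directly if
//     nothing was journaled);
//   r < 0  (abort): undo any inode export and replay the rollback blob via
//     do_rename_rollback(), unless the blob is empty (see comment below).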
8825 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
8826 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8827 {
8828 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
8829
8830 CInode *in = destdn->get_linkage()->get_inode();
8831
8832 inodeno_t migrated_stray;
8833 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
8834 migrated_stray = in->ino();
8835
8836 MDSContext::vec finished;
8837 if (r == 0) {
8838 // unfreeze+singleauth inode
8839 // hmm, do i really need to delay this?
8840 if (mdr->more()->is_inode_exporter) {
8841 // drop our pins
8842 // we exported, clear out any xlocks that we moved to another MDS
8843
8844 for (auto i = mdr->locks.lower_bound(&in->versionlock);
8845 i != mdr->locks.end(); ) {
8846 SimpleLock *lock = i->lock;
8847 if (lock->get_parent() != in)
8848 break;
8849 // we only care about xlocks on the exported inode
8850 if (i->is_xlock() && !lock->is_locallock())
8851 mds->locker->xlock_export(i++, mdr.get());
8852 else
8853 ++i;
8854 }
8855
8856 map<client_t,Capability::Import> peer_imported;
8857 auto bp = mdr->more()->inode_import.cbegin();
8858 decode(peer_imported, bp);
8859
8860 dout(10) << " finishing inode export on " << *in << dendl;
8861 mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
8862 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
8863
8864 // unfreeze
8865 ceph_assert(in->is_frozen_inode());
8866 in->unfreeze_inode(finished);
8867 }
8868
8869 // singleauth
8870 if (mdr->more()->is_ambiguous_auth) {
8871 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8872 mdr->more()->is_ambiguous_auth = false;
8873 }
8874
8875 if (straydn && mdr->more()->slave_update_journaled) {
8876 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8877 if (strayin && !strayin->snaprealm)
8878 mdcache->clear_dirty_bits_for_stray(strayin);
8879 }
8880
8881 mds->queue_waiters(finished);
8882 mdr->cleanup();
8883
8884 if (mdr->more()->slave_update_journaled) {
8885 // write a commit to the journal
8886 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
8887 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
8888 ESlaveUpdate::RENAME);
8889 mdlog->start_entry(le);
8890 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
8891 mdlog->flush();
8892 } else {
8893 _committed_slave(mdr);
8894 }
8895 } else {
8896
8897 // abort
8898 // rollback_bl may be empty if we froze the inode but had to provide an expanded
8899 // witness list to the master, and it failed before we tried prep again.
8900 if (mdr->more()->rollback_bl.length()) {
8901 if (mdr->more()->is_inode_exporter) {
8902 dout(10) << " reversing inode export of " << *in << dendl;
8903 in->abort_export();
8904 }
8905 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
8906 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
8907 // rollback but preserve the slave request
8908 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
8909 mdr->more()->rollback_bl.clear();
8910 } else
8911 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
8912 } else {
8913 dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
8914 // singleauth
8915 if (mdr->more()->is_ambiguous_auth) {
8916 if (srcdn->is_auth())
8917 mdr->more()->rename_inode->unfreeze_inode(finished);
8918
8919 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8920 mdr->more()->is_ambiguous_auth = false;
8921 }
8922 mds->queue_waiters(finished);
8923 mdcache->request_finish(mdr);
8924 }
8925 }
8926
8927 if (migrated_stray && mds->is_stopping())
8928 mdcache->shutdown_export_stray_finish(migrated_stray);
8929 }
8930
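// Undo one dirfrag's accounting for a rolled-back link or unlink. The
// linkunlink argument is +1 to re-add an entry and -1 to remove one; for
// example, restoring a plain file into its original dirfrag passes
// isdir=false, linkunlink=+1, so fragstat.nfiles is bumped by one and the
// file's accounted rstat is added back into the dirfrag's rstat.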
8931 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
8932 bool isdir, int linkunlink, nest_info_t &rstat)
8933 {
8934 fnode_t *pf;
8935 pf = dir->project_fnode();
8936 mut->add_projected_fnode(dir);
8937 pf->version = dir->pre_dirty();
8938
8939 if (isdir) {
8940 pf->fragstat.nsubdirs += linkunlink;
8941 } else {
8942 pf->fragstat.nfiles += linkunlink;
8943 }
8944 if (r.ino) {
8945 pf->rstat.rbytes += linkunlink * rstat.rbytes;
8946 pf->rstat.rfiles += linkunlink * rstat.rfiles;
8947 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
8948 pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
8949 }
8950 if (pf->fragstat.mtime == ctime) {
8951 pf->fragstat.mtime = r.dirfrag_old_mtime;
8952 if (pf->rstat.rctime == ctime)
8953 pf->rstat.rctime = r.dirfrag_old_rctime;
8954 }
8955 mut->add_updated_lock(&dir->get_inode()->filelock);
8956 mut->add_updated_lock(&dir->get_inode()->nestlock);
8957 }
8958
8959 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
8960 MutationRef mut;
8961 CDentry *srcdn;
8962 version_t srcdnpv;
8963 CDentry *destdn;
8964 CDentry *straydn;
8965 map<client_t,MClientSnap::ref> splits[2];
8966 bool finish_mdr;
8967 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
8968 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
8969 map<client_t,MClientSnap::ref> _splits[2], bool f) :
8970 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
8971 straydn(st), finish_mdr(f) {
8972 splits[0].swap(_splits[0]);
8973 splits[1].swap(_splits[1]);
8974 }
8975 void finish(int r) override {
8976 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
8977 destdn, straydn, splits, finish_mdr);
8978 }
8979 };
8980
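// Rebuild the pre-rename state recorded by handle_slave_rename_prep(): push
// srcdn/destdn/straydn back to their original linkages, restore ctime,
// nlink and snaprealm state on the affected inodes, repair the dirfrag
// stats, then journal an ESlaveUpdate::OP_ROLLBACK (or finish directly if
// the prepare was never journaled).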
8981 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
8982 bool finish_mdr)
8983 {
8984 rename_rollback rollback;
8985 auto p = rbl.cbegin();
8986 decode(rollback, p);
8987
8988 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
8989 // need to finish this update before sending resolve to claim the subtree
8990 mdcache->add_rollback(rollback.reqid, master);
8991
8992 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
8993 mut->ls = mds->mdlog->get_current_segment();
8994
8995 CDentry *srcdn = NULL;
8996 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
8997 if (!srcdir)
8998 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
8999 if (srcdir) {
9000 dout(10) << " srcdir " << *srcdir << dendl;
9001 srcdn = srcdir->lookup(rollback.orig_src.dname);
9002 if (srcdn) {
9003 dout(10) << " srcdn " << *srcdn << dendl;
9004 ceph_assert(srcdn->get_linkage()->is_null());
9005 } else
9006 dout(10) << " srcdn not found" << dendl;
9007 } else
9008 dout(10) << " srcdir not found" << dendl;
9009
9010 CDentry *destdn = NULL;
9011 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9012 if (!destdir)
9013 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9014 if (destdir) {
9015 dout(10) << " destdir " << *destdir << dendl;
9016 destdn = destdir->lookup(rollback.orig_dest.dname);
9017 if (destdn)
9018 dout(10) << " destdn " << *destdn << dendl;
9019 else
9020 dout(10) << " destdn not found" << dendl;
9021 } else
9022 dout(10) << " destdir not found" << dendl;
9023
9024 CInode *in = NULL;
9025 if (rollback.orig_src.ino) {
9026 in = mdcache->get_inode(rollback.orig_src.ino);
9027 if (in && in->is_dir())
9028 ceph_assert(srcdn && destdn);
9029 } else
9030 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9031
9032 CDir *straydir = NULL;
9033 CDentry *straydn = NULL;
9034 if (rollback.stray.dirfrag.ino) {
9035 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9036 if (straydir) {
9037 dout(10) << "straydir " << *straydir << dendl;
9038 straydn = straydir->lookup(rollback.stray.dname);
9039 if (straydn) {
9040 dout(10) << " straydn " << *straydn << dendl;
9041 ceph_assert(straydn->get_linkage()->is_primary());
9042 } else
9043 dout(10) << " straydn not found" << dendl;
9044 } else
9045 dout(10) << "straydir not found" << dendl;
9046 }
9047
9048 CInode *target = NULL;
9049 if (rollback.orig_dest.ino) {
9050 target = mdcache->get_inode(rollback.orig_dest.ino);
9051 if (target)
9052 ceph_assert(destdn && straydn);
9053 } else if (rollback.orig_dest.remote_ino)
9054 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9055
9056 // can't use is_auth() in the resolve stage
9057 mds_rank_t whoami = mds->get_nodeid();
9058 // slave
9059 ceph_assert(!destdn || destdn->authority().first != whoami);
9060 ceph_assert(!straydn || straydn->authority().first != whoami);
9061
9062 bool force_journal_src = false;
9063 bool force_journal_dest = false;
9064 if (in && in->is_dir() && srcdn->authority().first != whoami)
9065 force_journal_src = _need_force_journal(in, false);
9066 if (in && target && target->is_dir())
9067 force_journal_dest = _need_force_journal(in, true);
9068
9069 version_t srcdnpv = 0;
9070 // repair src
9071 if (srcdn) {
9072 if (srcdn->authority().first == whoami)
9073 srcdnpv = srcdn->pre_dirty();
9074 if (rollback.orig_src.ino) {
9075 ceph_assert(in);
9076 srcdn->push_projected_linkage(in);
9077 } else
9078 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9079 rollback.orig_src.remote_d_type);
9080 }
9081
9082 map<client_t,MClientSnap::ref> splits[2];
9083
9084 CInode::mempool_inode *pip = nullptr;
9085 if (in) {
9086 bool projected;
9087 if (in->get_projected_parent_dn()->authority().first == whoami) {
9088 auto &pi = in->project_inode();
9089 pip = &pi.inode;
9090 mut->add_projected_inode(in);
9091 pip->version = in->pre_dirty();
9092 projected = true;
9093 } else {
9094 pip = in->get_projected_inode();
9095 projected = false;
9096 }
9097 if (pip->ctime == rollback.ctime)
9098 pip->ctime = rollback.orig_src.old_ctime;
9099
9100 if (rollback.srci_snapbl.length() && in->snaprealm) {
9101 bool hadrealm;
9102 auto p = rollback.srci_snapbl.cbegin();
9103 decode(hadrealm, p);
9104 if (hadrealm) {
9105 if (projected && !mds->is_resolve()) {
9106 sr_t *new_srnode = new sr_t();
9107 decode(*new_srnode, p);
9108 in->project_snaprealm(new_srnode);
9109 } else
9110 decode(in->snaprealm->srnode, p);
9111 } else {
9112 SnapRealm *realm;
9113 if (rollback.orig_src.ino) {
9114 ceph_assert(srcdir);
9115 realm = srcdir->get_inode()->find_snaprealm();
9116 } else {
9117 realm = in->snaprealm->parent;
9118 }
9119 if (!mds->is_resolve())
9120 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9121 if (projected)
9122 in->project_snaprealm(NULL);
9123 else
9124 in->snaprealm->merge_to(realm);
9125 }
9126 }
9127 }
9128
9129 if (srcdn && srcdn->authority().first == whoami) {
9130 nest_info_t blah;
9131 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9132 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
9133 }
9134
9135 // repair dest
9136 if (destdn) {
9137 if (rollback.orig_dest.ino && target) {
9138 destdn->push_projected_linkage(target);
9139 } else if (rollback.orig_dest.remote_ino) {
9140 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9141 rollback.orig_dest.remote_d_type);
9142 } else {
9143 // the dentry will be trimmed soon; it's OK to have the wrong linkage
9144 if (rollback.orig_dest.ino)
9145 ceph_assert(mds->is_resolve());
9146 destdn->push_projected_linkage();
9147 }
9148 }
9149
9150 if (straydn)
9151 straydn->push_projected_linkage();
9152
9153 if (target) {
9154 bool projected;
9155 CInode::mempool_inode *ti = nullptr;
9156 if (target->get_projected_parent_dn()->authority().first == whoami) {
9157 auto &pi = target->project_inode();
9158 ti = &pi.inode;
9159 mut->add_projected_inode(target);
9160 ti->version = target->pre_dirty();
9161 projected = true;
9162 } else {
9163 ti = target->get_projected_inode();
9164 projected = false;
9165 }
9166 if (ti->ctime == rollback.ctime)
9167 ti->ctime = rollback.orig_dest.old_ctime;
9168 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9169 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9170 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9171 else
9172 ceph_assert(rollback.orig_dest.remote_ino &&
9173 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9174 } else
9175 ti->nlink++;
9176
9177 if (rollback.desti_snapbl.length() && target->snaprealm) {
9178 bool hadrealm;
9179 auto p = rollback.desti_snapbl.cbegin();
9180 decode(hadrealm, p);
9181 if (hadrealm) {
9182 if (projected && !mds->is_resolve()) {
9183 sr_t *new_srnode = new sr_t();
9184 decode(*new_srnode, p);
9185 target->project_snaprealm(new_srnode);
9186 } else
9187 decode(target->snaprealm->srnode, p);
9188 } else {
9189 SnapRealm *realm;
9190 if (rollback.orig_dest.ino) {
9191 ceph_assert(destdir);
9192 realm = destdir->get_inode()->find_snaprealm();
9193 } else {
9194 realm = target->snaprealm->parent;
9195 }
9196 if (!mds->is_resolve())
9197 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9198 if (projected)
9199 target->project_snaprealm(NULL);
9200 else
9201 target->snaprealm->merge_to(realm);
9202 }
9203 }
9204 }
9205
9206 if (srcdn)
9207 dout(0) << " srcdn back to " << *srcdn << dendl;
9208 if (in)
9209 dout(0) << " srci back to " << *in << dendl;
9210 if (destdn)
9211 dout(0) << " destdn back to " << *destdn << dendl;
9212 if (target)
9213 dout(0) << " desti back to " << *target << dendl;
9214
9215 // journal it
9216 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
9217 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
9218 mdlog->start_entry(le);
9219
9220 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9221 le->commit.add_dir_context(srcdir);
9222 if (rollback.orig_src.ino)
9223 le->commit.add_primary_dentry(srcdn, 0, true);
9224 else
9225 le->commit.add_remote_dentry(srcdn, true);
9226 }
9227
9228 if (!rollback.orig_src.ino && // remote linkage
9229 in && in->authority().first == whoami) {
9230 le->commit.add_dir_context(in->get_projected_parent_dir());
9231 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9232 }
9233
9234 if (force_journal_dest) {
9235 ceph_assert(rollback.orig_dest.ino);
9236 le->commit.add_dir_context(destdir);
9237 le->commit.add_primary_dentry(destdn, 0, true);
9238 }
9239
9240 // slave: no need to journal straydn
9241
9242 if (target && target != in && target->authority().first == whoami) {
9243 ceph_assert(rollback.orig_dest.remote_ino);
9244 le->commit.add_dir_context(target->get_projected_parent_dir());
9245 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9246 }
9247
9248 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9249 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9250 le->commit.renamed_dirino = in->ino();
9251 if (srcdn->authority().first == whoami) {
9252 list<CDir*> ls;
9253 in->get_dirfrags(ls);
9254 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9255 CDir *dir = *p;
9256 if (!dir->is_auth())
9257 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9258 }
9259 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9260 }
9261 } else if (force_journal_dest) {
9262 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9263 le->commit.renamed_dirino = target->ino();
9264 }
9265
9266 if (target && target->is_dir()) {
9267 ceph_assert(destdn);
9268 mdcache->project_subtree_rename(target, straydir, destdir);
9269 }
9270
9271 if (in && in->is_dir()) {
9272 ceph_assert(srcdn);
9273 mdcache->project_subtree_rename(in, destdir, srcdir);
9274 }
9275
9276 if (mdr && !mdr->more()->slave_update_journaled) {
9277 ceph_assert(le->commit.empty());
9278 mdlog->cancel_entry(le);
9279 mut->ls = NULL;
9280 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
9281 } else {
9282 ceph_assert(!le->commit.empty());
9283 if (mdr)
9284 mdr->more()->slave_update_journaled = false;
9285 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9286 srcdn, srcdnpv, destdn, straydn,
9287 splits, finish_mdr);
9288 submit_mdlog_entry(le, fin, mdr, __func__);
9289 mdlog->flush();
9290 }
9291 }
9292
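// Applies the rollback state once it has (optionally) been journaled: pop
// the projected linkages pushed by do_rename_rollback(), readjust the
// subtree map for rolled-back directory renames, and either finish the
// request or let the slave continue, depending on finish_mdr.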
9293 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
9294 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9295 map<client_t,MClientSnap::ref> splits[2], bool finish_mdr)
9296 {
9297 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9298
9299 if (straydn) {
9300 straydn->get_dir()->unlink_inode(straydn);
9301 straydn->pop_projected_linkage();
9302 }
9303 if (destdn) {
9304 destdn->get_dir()->unlink_inode(destdn);
9305 destdn->pop_projected_linkage();
9306 }
9307 if (srcdn) {
9308 srcdn->pop_projected_linkage();
9309 if (srcdn->authority().first == mds->get_nodeid()) {
9310 srcdn->mark_dirty(srcdnpv, mut->ls);
9311 if (srcdn->get_linkage()->is_primary())
9312 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9313 }
9314 }
9315
9316 mut->apply();
9317
9318 if (srcdn && srcdn->get_linkage()->is_primary()) {
9319 CInode *in = srcdn->get_linkage()->get_inode();
9320 if (in && in->is_dir()) {
9321 ceph_assert(destdn);
9322 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9323 }
9324 }
9325
9326 if (destdn) {
9327 CInode *oldin = destdn->get_linkage()->get_inode();
9328 // update subtree map?
9329 if (oldin && oldin->is_dir()) {
9330 ceph_assert(straydn);
9331 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9332 }
9333 }
9334
9335 if (mds->is_resolve()) {
9336 CDir *root = NULL;
9337 if (straydn)
9338 root = mdcache->get_subtree_root(straydn->get_dir());
9339 else if (destdn)
9340 root = mdcache->get_subtree_root(destdn->get_dir());
9341 if (root)
9342 mdcache->try_trim_non_auth_subtree(root);
9343 } else {
9344 mdcache->send_snaps(splits[1]);
9345 mdcache->send_snaps(splits[0]);
9346 }
9347
9348 if (mdr) {
9349 MDSContext::vec finished;
9350 if (mdr->more()->is_ambiguous_auth) {
9351 if (srcdn->is_auth())
9352 mdr->more()->rename_inode->unfreeze_inode(finished);
9353
9354 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9355 mdr->more()->is_ambiguous_auth = false;
9356 }
9357 mds->queue_waiters(finished);
9358 if (finish_mdr || mdr->aborted)
9359 mdcache->request_finish(mdr);
9360 else
9361 mdr->more()->slave_rolling_back = false;
9362 }
9363
9364 mdcache->finish_rollback(mut->reqid);
9365
9366 mut->cleanup();
9367 }
9368
9369 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
9370 {
9371 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9372 << " witnessed by " << ack->get_source()
9373 << " " << *ack << dendl;
9374 mds_rank_t from = mds_rank_t(ack->get_source().num());
9375
9376 // note slave
9377 mdr->more()->slaves.insert(from);
9378 if (mdr->more()->srcdn_auth_mds == from &&
9379 mdr->more()->is_remote_frozen_authpin &&
9380 !mdr->more()->is_ambiguous_auth) {
9381 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
9382 }
9383
9384 // witnessed? or add extra witnesses?
9385 ceph_assert(mdr->more()->witnessed.count(from) == 0);
9386 if (ack->is_interrupted()) {
9387 dout(10) << " slave request interrupted, noop" << dendl;
9388 } else if (ack->witnesses.empty()) {
9389 mdr->more()->witnessed.insert(from);
9390 if (!ack->is_not_journaled())
9391 mdr->more()->has_journaled_slaves = true;
9392 } else {
9393 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
9394 mdr->more()->extra_witnesses = ack->witnesses;
9395 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
9396 }
9397
9398 // srci import?
9399 if (ack->inode_export.length()) {
9400 dout(10) << " got srci import" << dendl;
9401 mdr->more()->inode_import.share(ack->inode_export);
9402 mdr->more()->inode_import_v = ack->inode_export_v;
9403 }
9404
9405 // remove from waiting list
9406 ceph_assert(mdr->more()->waiting_on_slave.count(from));
9407 mdr->more()->waiting_on_slave.erase(from);
9408
9409 if (mdr->more()->waiting_on_slave.empty())
9410 dispatch_client_request(mdr); // go again!
9411 else
9412 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
9413 }
9414
9415 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
9416 {
9417 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
9418 << ack->get_source() << dendl;
9419 ceph_assert(mdr->is_slave());
9420 mds_rank_t from = mds_rank_t(ack->get_source().num());
9421
9422 if (mdr->more()->waiting_on_slave.count(from)) {
9423 mdr->more()->waiting_on_slave.erase(from);
9424
9425 if (mdr->more()->waiting_on_slave.empty()) {
9426 if (mdr->slave_request)
9427 dispatch_slave_request(mdr);
9428 } else
9429 dout(10) << " still waiting for rename notify acks from "
9430 << mdr->more()->waiting_on_slave << dendl;
9431 }
9432 }
9433
9434 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
9435 {
9436 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
9437
9438 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
9439 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
9440
9441 if (mdr->more()->waiting_on_slave.empty()) {
9442 if (mdr->slave_request)
9443 dispatch_slave_request(mdr);
9444 } else
9445 dout(10) << " still waiting for rename notify acks from "
9446 << mdr->more()->waiting_on_slave << dendl;
9447 }
9448 }
9449
9450 // snaps
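// From the client's point of view these ops are driven through the snapshot
// pseudo-directory (".snap" by default); a sketch of typical usage, not
// part of this file:
//   $ ls    dir/.snap          # -> handle_client_lssnap
//   $ mkdir dir/.snap/mysnap   # -> handle_client_mksnap
//   $ rmdir dir/.snap/mysnap   # -> handle_client_rmsnap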
9451 /* This function takes responsibility for the passed mdr*/
9452 void Server::handle_client_lssnap(MDRequestRef& mdr)
9453 {
9454 const MClientRequest::const_ref &req = mdr->client_request;
9455
9456 // traverse to path
9457 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9458 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9459 respond_to_request(mdr, -ESTALE);
9460 return;
9461 }
9462 if (!diri->is_auth()) {
9463 mdcache->request_forward(mdr, diri->authority().first);
9464 return;
9465 }
9466 if (!diri->is_dir()) {
9467 respond_to_request(mdr, -ENOTDIR);
9468 return;
9469 }
9470 dout(10) << "lssnap on " << *diri << dendl;
9471
9472 // lock snap
9473 MutationImpl::LockOpVec lov;
9474 mds->locker->include_snap_rdlocks(diri, lov);
9475 if (!mds->locker->acquire_locks(mdr, lov))
9476 return;
9477
9478 if (!check_access(mdr, diri, MAY_READ))
9479 return;
9480
9481 SnapRealm *realm = diri->find_snaprealm();
9482 map<snapid_t,const SnapInfo*> infomap;
9483 realm->get_snap_info(infomap, diri->get_oldest_snap());
9484
9485 unsigned max_entries = req->head.args.readdir.max_entries;
9486 if (!max_entries)
9487 max_entries = infomap.size();
9488 int max_bytes = req->head.args.readdir.max_bytes;
9489 if (!max_bytes)
9490 // make sure at least one item can be encoded
9491 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
9492
9493 __u64 last_snapid = 0;
9494 string offset_str = req->get_path2();
9495 if (!offset_str.empty())
9496 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
9497
9498 // empty DirStat
9499 bufferlist dirbl;
9500 static DirStat empty;
9501 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
9502
9503 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
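// (whatever remains of max_bytes must still cover each snap dentry's name,
// lease and inodestat, plus the trailing num/flags encoded below)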
9504
9505 __u32 num = 0;
9506 bufferlist dnbl;
9507 auto p = infomap.upper_bound(last_snapid);
9508 for (; p != infomap.end() && num < max_entries; ++p) {
9509 dout(10) << p->first << " -> " << *p->second << dendl;
9510
9511 // actual
9512 string snap_name;
9513 if (p->second->ino == diri->ino())
9514 snap_name = p->second->name;
9515 else
9516 snap_name = p->second->get_long_name();
9517
9518 unsigned start_len = dnbl.length();
9519 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
9520 break;
9521
9522 encode(snap_name, dnbl);
9523 // infinite lease
9524 LeaseStat e(-1, -1, 0);
9525 mds->locker->encode_lease(dnbl, mdr->session->info, e);
9526 dout(20) << "encode_infinite_lease" << dendl;
9527
9528 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
9529 if (r < 0) {
9530 bufferlist keep;
9531 keep.substr_of(dnbl, 0, start_len);
9532 dnbl.swap(keep);
9533 break;
9534 }
9535 ++num;
9536 }
9537
9538 encode(num, dirbl);
9539 __u16 flags = 0;
9540 if (p == infomap.end()) {
9541 flags = CEPH_READDIR_FRAG_END;
9542 if (last_snapid == 0)
9543 flags |= CEPH_READDIR_FRAG_COMPLETE;
9544 }
9545 encode(flags, dirbl);
9546 dirbl.claim_append(dnbl);
9547
9548 mdr->reply_extra_bl = dirbl;
9549 mdr->tracei = diri;
9550 respond_to_request(mdr, 0);
9551 }
9552
9553
9554 // MKSNAP
9555
9556 struct C_MDS_mksnap_finish : public ServerLogContext {
9557 CInode *diri;
9558 SnapInfo info;
9559 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
9560 ServerLogContext(s, r), diri(di), info(i) {}
9561 void finish(int r) override {
9562 server->_mksnap_finish(mdr, diri, info);
9563 }
9564 };
9565
9566 /* This function takes responsibility for the passed mdr*/
9567 void Server::handle_client_mksnap(MDRequestRef& mdr)
9568 {
9569 const MClientRequest::const_ref &req = mdr->client_request;
9570 // make sure we have as new a map as the client
9571 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
9572 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
9573 return;
9574 }
9575 if (!mds->mdsmap->allows_snaps()) {
9576 // snapshots can't be created until they are explicitly enabled on the filesystem
9577 respond_to_request(mdr, -EPERM);
9578 return;
9579 }
9580
9581 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9582 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9583 respond_to_request(mdr, -ESTALE);
9584 return;
9585 }
9586
9587 if (!diri->is_auth()) { // fw to auth?
9588 mdcache->request_forward(mdr, diri->authority().first);
9589 return;
9590 }
9591
9592 // dir only
9593 if (!diri->is_dir()) {
9594 respond_to_request(mdr, -ENOTDIR);
9595 return;
9596 }
9597 if (diri->is_system() && !diri->is_root()) {
9598 // no snaps in system dirs (root is ok)
9599 respond_to_request(mdr, -EPERM);
9600 return;
9601 }
9602
9603 std::string_view snapname = req->get_filepath().last_dentry();
9604
9605 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9606 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9607 respond_to_request(mdr, -EPERM);
9608 return;
9609 }
9610
9611 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
9612
9613 // lock snap
9614 MutationImpl::LockOpVec lov;
9615
9616 mds->locker->include_snap_rdlocks(diri, lov);
9617 lov.erase_rdlock(&diri->snaplock);
9618 lov.add_xlock(&diri->snaplock);
9619
9620 if (!mds->locker->acquire_locks(mdr, lov))
9621 return;
9622
9623 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9624 return;
9625
9626 // make sure name is unique
9627 if (diri->snaprealm &&
9628 diri->snaprealm->exists(snapname)) {
9629 respond_to_request(mdr, -EEXIST);
9630 return;
9631 }
9632 if (snapname.length() == 0 ||
9633 snapname[0] == '_') {
9634 respond_to_request(mdr, -EINVAL);
9635 return;
9636 }
9637
9638 // allocate a snapid
9639 if (!mdr->more()->stid) {
9640 // prepare an stid
9641 mds->snapclient->prepare_create(diri->ino(), snapname,
9642 mdr->get_mds_stamp(),
9643 &mdr->more()->stid, &mdr->more()->snapidbl,
9644 new C_MDS_RetryRequest(mdcache, mdr));
9645 return;
9646 }
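// On the second pass, prepare_create() has stashed the snap table
// transaction id and the allocated snapid. A sketch of the two-phase
// pattern used here:
//   pass 1: snapclient->prepare_create(...), then return (the retry
//           callback re-enters this handler with stid set);
//   pass 2: journal an EUpdate carrying
//           add_table_transaction(TABLE_SNAP, stid);
//   finish: snapclient->commit(stid, mdr->ls) in _mksnap_finish().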
9647
9648 version_t stid = mdr->more()->stid;
9649 snapid_t snapid;
9650 auto p = mdr->more()->snapidbl.cbegin();
9651 decode(snapid, p);
9652 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
9653
9654 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9655
9656 // journal
9657 SnapInfo info;
9658 info.ino = diri->ino();
9659 info.snapid = snapid;
9660 info.name = snapname;
9661 info.stamp = mdr->get_op_stamp();
9662
9663 auto &pi = diri->project_inode(false, true);
9664 pi.inode.ctime = info.stamp;
9665 if (info.stamp > pi.inode.rstat.rctime)
9666 pi.inode.rstat.rctime = info.stamp;
9667 pi.inode.rstat.rsnaps++;
9668 pi.inode.version = diri->pre_dirty();
9669
9670 // project the snaprealm
9671 auto &newsnap = *pi.snapnode;
9672 newsnap.created = snapid;
9673 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
9674 if (!em.second)
9675 em.first->second = info;
9676 newsnap.seq = snapid;
9677 newsnap.last_created = snapid;
9678
9679 // journal the inode changes
9680 mdr->ls = mdlog->get_current_segment();
9681 EUpdate *le = new EUpdate(mdlog, "mksnap");
9682 mdlog->start_entry(le);
9683
9684 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9685 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9686 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9687 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9688
9689 // journal the snaprealm changes
9690 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
9691 mdr, __func__);
9692 mdlog->flush();
9693 }
9694
9695 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
9696 {
9697 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
9698
9699 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
9700
9701 diri->pop_and_dirty_projected_inode(mdr->ls);
9702 mdr->apply();
9703
9704 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9705
9706 // create snap
9707 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9708
9709 // notify other mds
9710 mdcache->send_snap_update(diri, mdr->more()->stid, op);
9711
9712 mdcache->do_realm_invalidate_and_update_notify(diri, op);
9713
9714 // yay
9715 mdr->in[0] = diri;
9716 mdr->snapid = info.snapid;
9717 mdr->tracei = diri;
9718 respond_to_request(mdr, 0);
9719 }
9720
9721
9722 // RMSNAP
9723
9724 struct C_MDS_rmsnap_finish : public ServerLogContext {
9725 CInode *diri;
9726 snapid_t snapid;
9727 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
9728 ServerLogContext(s, r), diri(di), snapid(sn) {}
9729 void finish(int r) override {
9730 server->_rmsnap_finish(mdr, diri, snapid);
9731 }
9732 };
9733
9734 /* This function takes responsibility for the passed mdr*/
9735 void Server::handle_client_rmsnap(MDRequestRef& mdr)
9736 {
9737 const MClientRequest::const_ref &req = mdr->client_request;
9738
9739 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9740 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9741 respond_to_request(mdr, -ESTALE);
9742 return;
9743 }
9744 if (!diri->is_auth()) { // fw to auth?
9745 mdcache->request_forward(mdr, diri->authority().first);
9746 return;
9747 }
9748 if (!diri->is_dir()) {
9749 respond_to_request(mdr, -ENOTDIR);
9750 return;
9751 }
9752
9753 std::string_view snapname = req->get_filepath().last_dentry();
9754
9755 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9756 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9757 respond_to_request(mdr, -EPERM);
9758 return;
9759 }
9760
9761 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
9762
9763 // does snap exist?
9764 if (snapname.length() == 0 || snapname[0] == '_') {
9765 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
9766 return;
9767 }
9768 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
9769 respond_to_request(mdr, -ENOENT);
9770 return;
9771 }
9772 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
9773 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
9774
9775 MutationImpl::LockOpVec lov;
9776 mds->locker->include_snap_rdlocks(diri, lov);
9777 lov.erase_rdlock(&diri->snaplock);
9778 lov.add_xlock(&diri->snaplock);
9779
9780 if (!mds->locker->acquire_locks(mdr, lov))
9781 return;
9782
9783 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9784 return;
9785
9786 // prepare
9787 if (!mdr->more()->stid) {
9788 mds->snapclient->prepare_destroy(diri->ino(), snapid,
9789 &mdr->more()->stid, &mdr->more()->snapidbl,
9790 new C_MDS_RetryRequest(mdcache, mdr));
9791 return;
9792 }
9793 version_t stid = mdr->more()->stid;
9794 auto p = mdr->more()->snapidbl.cbegin();
9795 snapid_t seq;
9796 decode(seq, p);
9797 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
9798
9799 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9800
9801 // journal
9802 auto &pi = diri->project_inode(false, true);
9803 pi.inode.version = diri->pre_dirty();
9804 pi.inode.ctime = mdr->get_op_stamp();
9805 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
9806 pi.inode.rstat.rctime = mdr->get_op_stamp();
9807 pi.inode.rstat.rsnaps--;
9808
9809 mdr->ls = mdlog->get_current_segment();
9810 EUpdate *le = new EUpdate(mdlog, "rmsnap");
9811 mdlog->start_entry(le);
9812
9813 // project the snaprealm
9814 auto &newnode = *pi.snapnode;
9815 newnode.snaps.erase(snapid);
9816 newnode.seq = seq;
9817 newnode.last_destroyed = seq;
9818
9819 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9820 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9821 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9822 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9823
9824 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
9825 mdr, __func__);
9826 mdlog->flush();
9827 }
9828
9829 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9830 {
9831 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
9832 snapid_t stid = mdr->more()->stid;
9833 auto p = mdr->more()->snapidbl.cbegin();
9834 snapid_t seq;
9835 decode(seq, p);
9836
9837 diri->pop_and_dirty_projected_inode(mdr->ls);
9838 mdr->apply();
9839
9840 mds->snapclient->commit(stid, mdr->ls);
9841
9842 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9843
9844 // notify other mds
9845 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
9846
9847 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
9848
9849 // yay
9850 mdr->in[0] = diri;
9851 respond_to_request(mdr, 0);
9852
9853 // purge snapshot data
9854 if (diri->snaprealm->have_past_parents_open())
9855 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
9856 }
9857
9858 struct C_MDS_renamesnap_finish : public ServerLogContext {
9859 CInode *diri;
9860 snapid_t snapid;
9861 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
9862 ServerLogContext(s, r), diri(di), snapid(sn) {}
9863 void finish(int r) override {
9864 server->_renamesnap_finish(mdr, diri, snapid);
9865 }
9866 };
9867
9868 /* This function takes responsibility for the passed mdr*/
9869 void Server::handle_client_renamesnap(MDRequestRef& mdr)
9870 {
9871 const MClientRequest::const_ref &req = mdr->client_request;
9872 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
9873 respond_to_request(mdr, -EINVAL);
9874 return;
9875 }
9876
9877 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9878 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9879 respond_to_request(mdr, -ESTALE);
9880 return;
9881 }
9882
9883 if (!diri->is_auth()) { // fw to auth?
9884 mdcache->request_forward(mdr, diri->authority().first);
9885 return;
9886 }
9887
9888 if (!diri->is_dir()) { // dir only
9889 respond_to_request(mdr, -ENOTDIR);
9890 return;
9891 }
9892
9893 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
9894 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9895 respond_to_request(mdr, -EPERM);
9896 return;
9897 }
9898
9899 std::string_view dstname = req->get_filepath().last_dentry();
9900 std::string_view srcname = req->get_filepath2().last_dentry();
9901 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
9902
9903 if (srcname.length() == 0 || srcname[0] == '_') {
9904 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
9905 return;
9906 }
9907 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
9908 respond_to_request(mdr, -ENOENT);
9909 return;
9910 }
9911 if (dstname.length() == 0 || dstname[0] == '_') {
9912 respond_to_request(mdr, -EINVAL);
9913 return;
9914 }
9915 if (diri->snaprealm->exists(dstname)) {
9916 respond_to_request(mdr, -EEXIST);
9917 return;
9918 }
9919
9920 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
9921 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
9922
9923 // lock snap
9924 MutationImpl::LockOpVec lov;
9925
9926 mds->locker->include_snap_rdlocks(diri, lov);
9927 lov.erase_rdlock(&diri->snaplock);
9928 lov.add_xlock(&diri->snaplock);
9929
9930 if (!mds->locker->acquire_locks(mdr, lov))
9931 return;
9932
9933 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9934 return;
9935
9936 // prepare
9937 if (!mdr->more()->stid) {
9938 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
9939 &mdr->more()->stid,
9940 new C_MDS_RetryRequest(mdcache, mdr));
9941 return;
9942 }
9943
9944 version_t stid = mdr->more()->stid;
9945 dout(10) << " stid is " << stid << dendl;
9946
9947 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9948
9949 // journal
9950 auto &pi = diri->project_inode(false, true);
9951 pi.inode.ctime = mdr->get_op_stamp();
9952 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
9953 pi.inode.rstat.rctime = mdr->get_op_stamp();
9954 pi.inode.version = diri->pre_dirty();
9955
9956 // project the snaprealm
9957 auto &newsnap = *pi.snapnode;
9958 auto it = newsnap.snaps.find(snapid);
9959 ceph_assert(it != newsnap.snaps.end());
9960 it->second.name = dstname;
9961
9962 // journal the inode changes
9963 mdr->ls = mdlog->get_current_segment();
9964 EUpdate *le = new EUpdate(mdlog, "renamesnap");
9965 mdlog->start_entry(le);
9966
9967 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9968 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9969 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9970 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9971
9972 // journal the snaprealm changes
9973 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
9974 mdr, __func__);
9975 mdlog->flush();
9976 }
9977
9978 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9979 {
9980 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
9981
9982 diri->pop_and_dirty_projected_inode(mdr->ls);
9983 mdr->apply();
9984
9985 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9986
9987 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9988
9989 // notify other mds
9990 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
9991
9992 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
9993
9994 // yay
9995 mdr->in[0] = diri;
9996 mdr->tracei = diri;
9997 mdr->snapid = snapid;
9998 respond_to_request(mdr, 0);
9999 }
10000
10001 /**
10002 * Return true if server is in state RECONNECT and this
10003 * client has not yet reconnected.
10004 */
10005 bool Server::waiting_for_reconnect(client_t c) const
10006 {
10007 return client_reconnect_gather.count(c) > 0;
10008 }
10009
10010 void Server::dump_reconnect_status(Formatter *f) const
10011 {
10012 f->open_object_section("reconnect_status");
10013 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
10014 f->close_section();
10015 }