// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h" // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>
#include <math.h>

#include <list>
#include <iostream>
#include <string_view>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSContext {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};

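// Register this Server's perf counters (client/slave request counts and
// per-request-type latency averages) with the global perf counter
// collection.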
void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}

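// Top-level entry point for messages routed to the Server. Reconnects are
// handled immediately; client requests may be queued for replay or deferred
// until the MDS is active; everything else is dispatched by message type.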
void Server::dispatch(const Message::const_ref &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(MClientReconnect::msgref_cast(m));
    return;
  }

  // active?
  // handle_slave_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = MClientRequest::msgref_cast(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
        }
      } else if (req->get_retry_attempt()) {
        // process completed request in clientreplay stage. The completed request
        // may have created a new file/directory. This guarantees the MDS sends a reply
        // to the client before another request modifies the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(MClientRequest::msgref_cast(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(MClientReclaim::msgref_cast(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(MMDSSlaveRequest::msgref_cast(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}



// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

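// Find the session whose client metadata carries the given "uuid" value.
// If two sessions share a uuid (one in the middle of reclaiming the other),
// the session doing the reclaiming is returned.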
Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

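// Handle a client reclaim request: validate the uuid and flags, pair the
// requesting session with the session being reclaimed, and (for
// CEPH_RECLAIM_RESET, the only supported mode) finish the reclaim by
// evicting the old session.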
void Server::reclaim_session(Session *session, const MClientReclaim::const_ref &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = MClientReclaimReply::create(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-EPERM);
      mds->send_message_client(reply, session);
    }

    assert(!target->reclaiming_from);
    assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

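// Complete a reclaim: evict (or kill) the session being reclaimed from, then
// reply to the reclaiming client once the eviction has finished.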
void Server::finish_reclaim_session(Session *session, const MClientReclaimReply::ref &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new FunctionContext([this, session_id, reply](int r) {
        assert(mds->mds_lock.is_locked_by_me());
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blacklisted(target->info.inst.addr);
    });

    if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      std::stringstream ss;
      mds->evict_client(target->get_client().v, false, true, ss, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

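// Entry point for MClientReclaim messages. FLAG_FINISH completes a reclaim
// already in progress; anything else starts one.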
void Server::handle_client_reclaim(const MClientReclaim::const_ref &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

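// State machine for client session messages: open (with blacklist, feature,
// root-path and uuid validation), renewcaps, close, flushmsg ack, and mdlog
// flush requests.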
void Server::handle_client_session(const MClientSession::const_ref &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
        auto m = MClientSession::create(CEPH_SESSION_REJECT);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blacklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blacklisted(addr);
          });

      if (blacklisted) {
        dout(10) << "rejecting blacklisted client " << addr << dendl;
        send_reject_message("blacklisted");
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
533 dout(20) << " features: '" << client_metadata.features << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << " " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        send_reject_message(ss.str());
        mds->clog->warn() << "client session lacks required features '"
                          << missing_features << "' denied (" << session->info.inst << ")";
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        stringstream ss;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
560 ss << "invalue root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          ss << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(ss.str());
          mds->clog->warn() << "client session with " << ss.str()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed())
        mds->sessionmap.add_session(session);

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new FunctionContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = MClientSession::create(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}


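// Ask one client to flush its messages; the gather sub completes when the
// client acks with CEPH_SESSION_FLUSHMSG_ACK. No-op for sessions that are
// not open or whose connection lacks CEPH_FEATURE_EXPORT_PEER.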
void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather->new_sub());
  mds->send_message_client(
      MClientSession::create(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    ceph_assert(session);
    flush_session(session, &gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

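// Journal callback for session open/close: apply the journaled state change,
// releasing preallocated inos and either marking the session open or tearing
// down its caps, leases and connection for closing/killing sessions.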
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    ceph_assert(session->is_closing() || session->is_killing() ||
                session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    ceph_assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    ceph_assert(session->get_connection());
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = MClientSession::create(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap, true);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(MClientSession::create(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        session->get_connection()->set_priv(NULL);
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blacklisted(p->second.addr)) {
            dout(10) << " ignoring blacklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

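// Second half of the force-open path: move the sessions prepared above to
// OPEN (unless their state seq changed in the meantime) and notify the
// clients.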
void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);

        auto reply = MClientSession::create(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          reply->supported_features = supported_features;
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off. clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}


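// Periodic scan for idle clients: mark laggy sessions stale once they pass
// the session_timeout, and evict sessions that pass session_autoclose (or
// their client-specified "timeout") without renewing caps.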
void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  // (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    return;
  }

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        continue;
      }

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        continue;
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = MClientSession::create(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
      } else {
        to_evict.push_back(session);
      }
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session: to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

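// Evict clients that have not responded to a cap revoke within
// mds_cap_revoke_eviction_timeout seconds (a timeout of 0 disables this).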
void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  std::list<client_t> to_evict;
  mds->locker->get_late_revoking_clients(&to_evict, cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    std::stringstream ss;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blacklist_on_evict,
                                     ss, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const ConfigProxy& conf,
                                const std::set <std::string> &changed) {
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

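// Kill every client session whose address appears in the given OSDMap
// blacklist. Pre-nautilus OSDMaps store entries as TYPE_LEGACY addresses,
// so both address types are checked. Returns the number of sessions killed.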
size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
        return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
      });

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blacklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blacklist.count(inst_addr)) {
      victims.push_back(s);
      continue;
    }
    if (prenautilus) {
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blacklist.count(inst_addr)) {
        victims.push_back(s);
      }
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

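// Move a session to CLOSING or KILLING: journal an ESession close event
// (releasing any preallocated inos), kill the session's in-flight requests,
// and complete any outstanding message flushes.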
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
    session->requests.begin(member_offset(MDRequestImpl,
                                          item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

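// Begin the reconnect phase after an MDS restart: build the gather set of
// open client sessions that must send an MClientReconnect before the MDS
// can proceed to rejoin.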
void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

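// Process a client's reconnect message: deny it if the MDS has left the
// reconnect state or the session is unusable, otherwise re-establish the
// client's snaprealms, caps and file locks from the message contents.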
void Server::handle_client_reconnect(const MClientReconnect::const_ref &m)
{
  dout(7) << "handle_client_reconnect " << m->get_source()
          << (m->has_more() ? " (more)" : "") << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  if (!session)
    return;

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
    deny = true;
  } else {
    std::string error_str;
    if (!session->is_open()) {
      error_str = "session is closed";
    } else if (mdcache->is_readonly()) {
      error_str = "mds is readonly";
    } else {
      if (session->info.client_metadata.features.empty())
        infer_supported_features(session, session->info.client_metadata);

      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        error_str = ss.str();
      }
    }

    if (!error_str.empty()) {
      deny = true;
      dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
      mds->clog->info() << "denied reconnect attempt from "
                        << m->get_source_inst() << " (" << error_str << ")";
    }
  }

  if (deny) {
    auto r = MClientSession::create(CEPH_SESSION_CLOSE);
    mds->send_message_client(r, session);
    if (session->is_open())
      kill_session(session, nullptr);
    return;
  }

  if (!m->has_more()) {
    // notify client of success with an OPEN
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
  }

  session->last_cap_renew = clock::now();

  // snaprealms
  for (const auto &r : m->realms) {
    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      if (in->snaprealm) {
        dout(15) << "open snaprealm (w inode) on " << *in << dendl;
      } else {
        // this can happen if we are non-auth or we rollback snaprealm
        dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
      }
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
               << " seq " << r.realm.seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    }
  }

  // caps
  for (const auto &p : m->caps) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p.second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p.first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours. take note.
      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p.second, session);
      mdcache->add_reconnected_cap(from, p.first, p.second);
      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      mdcache->rejoin_export_caps(p.first, from, p.second,
                                  in->authority().first, true);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
    }
  }

  reconnect_last_seen = clock::now();

  if (!m->has_more()) {
    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

    // remove from gather set
    client_reconnect_gather.erase(from);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
  }
}

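// Guess a feature bitset for clients that predate feature reporting, based
// on the advertised ceph/kernel version and the connection's feature bits.
// (1UL << (supported + 1)) - 1 sets every feature bit up to and including
// `supported`; e.g. supported == 5 yields the mask 0b111111.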
void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
{
  int supported = -1;
  auto it = client_metadata.find("ceph_version");
  if (it != client_metadata.end()) {
    // user space client
    if (it->second.compare(0, 16, "ceph version 12.") == 0)
      supported = CEPHFS_FEATURE_LUMINOUS;
    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
      supported = CEPHFS_FEATURE_KRAKEN;
  } else {
    it = client_metadata.find("kernel_version");
    if (it != client_metadata.end()) {
      // kernel client
      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
        supported = CEPHFS_FEATURE_LUMINOUS;
    }
  }
  if (supported == -1 &&
      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
    supported = CEPHFS_FEATURE_JEWEL;

  if (supported >= 0) {
    unsigned long value = (1UL << (supported + 1)) - 1;
    client_metadata.features = feature_bitset_t(value);
    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
  }
}

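// Recompute required_client_features from the mdsmap's min_compat_client
// release, then evict any connected (non-blacklisted) session that no
// longer satisfies the requirement.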
void Server::update_required_client_features()
{
  vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;

  int min_compat = mds->mdsmap->get_min_compat_client();
  if (min_compat >= CEPH_RELEASE_NAUTILUS) {
    bits.push_back(CEPHFS_FEATURE_NAUTILUS);
  } else if (min_compat >= CEPH_RELEASE_MIMIC)
    bits.push_back(CEPHFS_FEATURE_MIMIC);
  else if (min_compat >= CEPH_RELEASE_LUMINOUS)
    bits.push_back(CEPHFS_FEATURE_LUMINOUS);
  else if (min_compat >= CEPH_RELEASE_KRAKEN)
    bits.push_back(CEPHFS_FEATURE_KRAKEN);
  else if (min_compat >= CEPH_RELEASE_JEWEL)
    bits.push_back(CEPHFS_FEATURE_JEWEL);

  std::sort(bits.begin(), bits.end());
  required_client_features = feature_bitset_t(bits);
  dout(7) << "required_client_features: " << required_client_features << dendl;

  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
    set<Session*> sessions;
    mds->sessionmap.get_client_session_set(sessions);
    for (auto session : sessions) {
      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        bool blacklisted = mds->objecter->with_osdmap(
            [session](const OSDMap &osd_map) -> bool {
              return osd_map.is_blacklisted(session->info.inst.addr);
            });
        if (blacklisted)
          continue;

        mds->clog->warn() << "evicting session " << *session << ", missing required features '"
                          << missing_features << "'";
        std::stringstream ss;
        mds->evict_client(session->get_client().v, false,
                          g_conf()->mds_session_blacklist_on_evict, ss);
      }
    }
  }
}

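// All expected reconnects have arrived (or been given up on): move to the
// next recovery stage, first waiting for the snaptable cache to sync if
// necessary.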
void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  ceph_assert(reconnect_done);

  if (!mds->snapclient->is_synced()) {
    // make sure snaptable cache is populated. snaprealms will be
    // extensively used in rejoin stage.
    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
    mds->snapclient->wait_for_sync(reconnect_done);
  } else {
    reconnect_done->complete(0);
  }
  reconnect_done = NULL;
}

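// Called periodically while in reconnect: once mds_reconnect_timeout has
// elapsed with no recent client activity, give up on the stragglers,
// evicting them (or deferring them to the reclaim path if they declared a
// "timeout" in their metadata).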
void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  if (client_reconnect_gather.empty())
    return;

  auto now = clock::now();
  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
  if (elapse1 < g_conf()->mds_reconnect_timeout)
    return;

  vector<Session*> remaining_sessions;
  remaining_sessions.reserve(client_reconnect_gather.size());
  for (auto c : client_reconnect_gather) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
    ceph_assert(session);
    remaining_sessions.push_back(session);
    // client re-sends cap flush messages before the reconnect message
    if (session->last_seen > reconnect_last_seen)
      reconnect_last_seen = session->last_seen;
  }

  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
  if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
    dout(7) << "reconnect_tick: last seen " << elapse2
            << " seconds ago, extending reconnect interval" << dendl;
    return;
  }

  dout(7) << "reconnect timed out, " << remaining_sessions.size()
          << " clients have not reconnected in time" << dendl;

  // If we're doing blacklist evictions, use this to wait for them before
  // proceeding to reconnect_gather_finish
  MDSGatherBuilder gather(g_ceph_context);

  for (auto session : remaining_sessions) {
    // Keep sessions that have specified timeout. These sessions will prevent
    // mds from going to active. MDS goes to active after they all have been
    // killed or reclaimed.
    if (session->info.client_metadata.find("timeout") !=
        session->info.client_metadata.end()) {
      dout(1) << "reconnect keeps " << session->info.inst
              << ", need to be reclaimed" << dendl;
      client_reclaim_gather.insert(session->get_client());
      continue;
    }

    dout(1) << "reconnect gives up on " << session->info.inst << dendl;

    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after waiting " << elapse1
                      << " seconds during MDS startup";

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss,
                        gather.new_sub());
    } else {
      kill_session(session, NULL);
    }

    failed_reconnects++;
  }
  client_reconnect_gather.clear();

  if (gather.has_subs()) {
    dout(1) << "reconnect will complete once clients are evicted" << dendl;
    gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
        [this](int r){reconnect_gather_finish();})));
    gather.activate();
    reconnect_evicting = true;
  } else {
    reconnect_gather_finish();
  }
}

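// Rebuild an inode's fcntl and flock lock state from the lock blob a client
// sent in its reconnect message: a count followed by the fcntl locks, then
// a count followed by the flock locks.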
void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  auto p = locks.cbegin();
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}

1558 /**
1559 * Call this when the MDCache is oversized, to send requests to the clients
1560 * to trim some caps, and consequently unpin some inodes in the MDCache so
1561 * that it can trim too.
1562 */
1563 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1564 {
1565 const auto now = clock::now();
1566 const bool steady = flags&RecallFlags::STEADY;
1567 const bool enforce_max = flags&RecallFlags::ENFORCE_MAX;
1568
1569 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1570 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1571 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1572 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1573 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1574
1575 dout(7) << __func__ << ":"
1576 << " min=" << min_caps_per_client
1577 << " max=" << max_caps_per_client
1578 << " total=" << Capability::count()
1579 << " flags=0x" << std::hex << flags
1580 << dendl;
1581
1582 /* trim caps of sessions with the most caps first */
1583 std::multimap<uint64_t, Session*> caps_session;
1584 auto f = [&caps_session, enforce_max, max_caps_per_client](auto& s) {
1585 auto num_caps = s->caps.size();
1586 if (!enforce_max || num_caps > max_caps_per_client) {
1587 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1588 }
1589 };
1590 mds->sessionmap.get_client_sessions(std::move(f));
1591
1592 std::pair<bool, uint64_t> result = {false, 0};
1593 auto& [throttled, caps_recalled] = result;
1594 last_recall_state = now;
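// caps_session is sorted by ascending cap count; walk it in reverse so we
// recall from the largest cap holders first.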
1595 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1596 if (!session->is_open() ||
1597 !session->get_connection() ||
1598 !session->info.inst.name.is_client())
1599 continue;
1600
1601 dout(10) << __func__ << ":"
1602 << " session " << session->info.inst
1603 << " caps " << num_caps
1604 << ", leases " << session->leases.size()
1605 << dendl;
1606
1607 uint64_t newlim;
1608 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1609 newlim = min_caps_per_client;
1610 } else {
1611 newlim = num_caps-recall_max_caps;
1612 }
1613 if (num_caps > newlim) {
1614 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1615 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1616 newlim = num_caps-recall;
1617 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1618 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1619 const uint64_t global_recall_throttle = recall_throttle.get();
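// Three decaying counters gate the recall:
//  - session_recall_throttle: caps recently recalled from this session;
//    crossing its threshold skips just this session.
//  - session_recall_throttle2o: a second-order per-session counter checked
//    against 2*recall_max_caps, to catch short bursts against one session.
//  - global_recall_throttle: caps recently recalled across all sessions;
//    crossing it aborts the whole pass (break, not continue).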
1620 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1621 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1622 throttled = true;
1623 continue;
1624 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1625 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1626 throttled = true;
1627 continue;
1628 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1629 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1630 throttled = true;
1631 break;
1632 }
1633
1634 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1635 if (steady) {
1636 const auto session_recall = session->get_recall_caps();
1637 const auto session_release = session->get_release_caps();
1638 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1639 /* The session has been unable to keep up with the number of caps
1640 * recalled (releases are less than half of recalls); additionally, to
1641 * avoid flagging sessions we've only just begun to recall from, twice
1642 * the session_recall counter (decayed count of caps recently recalled)
1643 * must exceed the session's cap recall decay threshold.
1644 */
1645 dout(15) << " 2*session_release < session_recall"
1646 " (2*" << session_release << " < " << session_recall << ") &&"
1647 " 2*session_recall < recall_max_decay_threshold"
1648 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1649 " Skipping because we are unlikely to get more released." << dendl;
1650 continue;
1651 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1652 /* The number of caps to recall is less than the number we *could*
1653 * recall (so there isn't much left to recall?) and twice that number
1654 * is still less than the current recall_caps counter (decayed count
1655 * of caps recently recalled).
1656 */
1657 dout(15) << " 2*recall < session_recall "
1658 " (2*" << recall << " < " << session_recall << ") &&"
1659 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1660 " Skipping because we are unlikely to get more released." << dendl;
1661 continue;
1662 }
1663 }
1664
1665 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1666
1667 auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
1668 m->head.max_caps = newlim;
1669 mds->send_message_client(m, session);
1670 if (gather) {
1671 flush_session(session, gather);
1672 }
1673 caps_recalled += session->notify_recall_sent(newlim);
1674 recall_throttle.hit(recall);
1675 }
1676 }
1677
1678 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1679
1680 return result;
1681 }
1682
1683 void Server::force_clients_readonly()
1684 {
1685 dout(10) << "force_clients_readonly" << dendl;
1686 set<Session*> sessions;
1687 mds->sessionmap.get_client_session_set(sessions);
1688 for (set<Session*>::const_iterator p = sessions.begin();
1689 p != sessions.end();
1690 ++p) {
1691 Session *session = *p;
1692 if (!session->info.inst.name.is_client() ||
1693 !(session->is_open() || session->is_stale()))
1694 continue;
1695 mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
1696 }
1697 }
1698
1699 /*******
1700 * some generic stuff for finishing off requests
1701 */
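/*
 * journal_and_reply: optionally send an unsafe (early) reply right away,
 * then journal the update; the safe reply is sent by the log-commit
 * callback (fin). Replayed ops instead kick the next queued replay op;
 * early-replied ops can drop their rdlocks immediately, since the client
 * already has its answer; otherwise we flush the log so the client isn't
 * left waiting on an unflushed journal.
 */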
1702 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1703 {
1704 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1705 ceph_assert(!mdr->has_completed);
1706
1707 // note trace items for eventual reply.
1708 mdr->tracei = in;
1709 if (in)
1710 mdr->pin(in);
1711
1712 mdr->tracedn = dn;
1713 if (dn)
1714 mdr->pin(dn);
1715
1716 early_reply(mdr, in, dn);
1717
1718 mdr->committing = true;
1719 submit_mdlog_entry(le, fin, mdr, __func__);
1720
1721 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1722 if (mds->queue_one_replay()) {
1723 dout(10) << " queued next replay op" << dendl;
1724 } else {
1725 dout(10) << " journaled last replay op" << dendl;
1726 }
1727 } else if (mdr->did_early_reply)
1728 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1729 else
1730 mdlog->flush();
1731 }
1732
1733 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1734 std::string_view event)
1735 {
1736 if (mdr) {
1737 string event_str("submit entry: ");
1738 event_str += event;
1739 mdr->mark_event(event_str);
1740 }
1741 mdlog->submit_entry(le, fin);
1742 }
1743
1744 /*
1745 * send response built from mdr contents and error code; clean up mdr
1746 */
1747 void Server::respond_to_request(MDRequestRef& mdr, int r)
1748 {
1749 if (mdr->client_request) {
1750 reply_client_request(mdr, MClientReply::create(*mdr->client_request, r));
1751 } else if (mdr->internal_op > -1) {
1752 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1753 if (!mdr->internal_op_finish)
1754 ceph_abort_msg("trying to respond to internal op without finisher");
1755 mdr->internal_op_finish->complete(r);
1756 mdcache->request_finish(mdr);
1757 }
1758 }
1759
1760 // statistics mds req op number and latency
1761 void Server::perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat)
1762 {
1763 int code = l_mdss_first;
1764 switch(req->get_op()) {
1765 case CEPH_MDS_OP_LOOKUPHASH:
1766 code = l_mdss_req_lookuphash_latency;
1767 break;
1768 case CEPH_MDS_OP_LOOKUPINO:
1769 code = l_mdss_req_lookupino_latency;
1770 break;
1771 case CEPH_MDS_OP_LOOKUPPARENT:
1772 code = l_mdss_req_lookupparent_latency;
1773 break;
1774 case CEPH_MDS_OP_LOOKUPNAME:
1775 code = l_mdss_req_lookupname_latency;
1776 break;
1777 case CEPH_MDS_OP_LOOKUP:
1778 code = l_mdss_req_lookup_latency;
1779 break;
1780 case CEPH_MDS_OP_LOOKUPSNAP:
1781 code = l_mdss_req_lookupsnap_latency;
1782 break;
1783 case CEPH_MDS_OP_GETATTR:
1784 code = l_mdss_req_getattr_latency;
1785 break;
1786 case CEPH_MDS_OP_SETATTR:
1787 code = l_mdss_req_setattr_latency;
1788 break;
1789 case CEPH_MDS_OP_SETLAYOUT:
1790 code = l_mdss_req_setlayout_latency;
1791 break;
1792 case CEPH_MDS_OP_SETDIRLAYOUT:
1793 code = l_mdss_req_setdirlayout_latency;
1794 break;
1795 case CEPH_MDS_OP_SETXATTR:
1796 code = l_mdss_req_setxattr_latency;
1797 break;
1798 case CEPH_MDS_OP_RMXATTR:
1799 code = l_mdss_req_rmxattr_latency;
1800 break;
1801 case CEPH_MDS_OP_READDIR:
1802 code = l_mdss_req_readdir_latency;
1803 break;
1804 case CEPH_MDS_OP_SETFILELOCK:
1805 code = l_mdss_req_setfilelock_latency;
1806 break;
1807 case CEPH_MDS_OP_GETFILELOCK:
1808 code = l_mdss_req_getfilelock_latency;
1809 break;
1810 case CEPH_MDS_OP_CREATE:
1811 code = l_mdss_req_create_latency;
1812 break;
1813 case CEPH_MDS_OP_OPEN:
1814 code = l_mdss_req_open_latency;
1815 break;
1816 case CEPH_MDS_OP_MKNOD:
1817 code = l_mdss_req_mknod_latency;
1818 break;
1819 case CEPH_MDS_OP_LINK:
1820 code = l_mdss_req_link_latency;
1821 break;
1822 case CEPH_MDS_OP_UNLINK:
1823 code = l_mdss_req_unlink_latency;
1824 break;
1825 case CEPH_MDS_OP_RMDIR:
1826 code = l_mdss_req_rmdir_latency;
1827 break;
1828 case CEPH_MDS_OP_RENAME:
1829 code = l_mdss_req_rename_latency;
1830 break;
1831 case CEPH_MDS_OP_MKDIR:
1832 code = l_mdss_req_mkdir_latency;
1833 break;
1834 case CEPH_MDS_OP_SYMLINK:
1835 code = l_mdss_req_symlink_latency;
1836 break;
1837 case CEPH_MDS_OP_LSSNAP:
1838 code = l_mdss_req_lssnap_latency;
1839 break;
1840 case CEPH_MDS_OP_MKSNAP:
1841 code = l_mdss_req_mksnap_latency;
1842 break;
1843 case CEPH_MDS_OP_RMSNAP:
1844 code = l_mdss_req_rmsnap_latency;
1845 break;
1846 case CEPH_MDS_OP_RENAMESNAP:
1847 code = l_mdss_req_renamesnap_latency;
1848 break;
1849 default: ceph_abort();
1850 }
1851 logger->tinc(code, lat);
1852 }
1853
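/*
 * Send an unsafe reply before the journal entry commits. This is skipped
 * when early reply is disabled in config, when the request sets
 * no_early_reply, when there are journaled slave updates, when a new ino
 * was allocated, when the requester is another MDS, or when the op is a
 * replay.
 */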
1854 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1855 {
1856 if (!g_conf()->mds_early_reply)
1857 return;
1858
1859 if (mdr->no_early_reply) {
1860 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1861 return;
1862 }
1863
1864 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
1865 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
1866 return;
1867 }
1868
1869 if (mdr->alloc_ino) {
1870 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
1871 return;
1872 }
1873
1874 const MClientRequest::const_ref &req = mdr->client_request;
1875 entity_inst_t client_inst = req->get_source_inst();
1876 if (client_inst.name.is_mds())
1877 return;
1878
1879 if (req->is_replay()) {
1880 dout(10) << " no early reply on replay op" << dendl;
1881 return;
1882 }
1883
1884
1885 auto reply = MClientReply::create(*req, 0);
1886 reply->set_unsafe();
1887
1888 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1889 //
1890 // _rename_finish() does not send dentry link/unlink messages to replicas,
1891 // so do not set xlocks on dentries "done"; the xlocks prevent dentries
1892 // that have projected linkages from getting new replicas.
1893 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
1894
1895 dout(10) << "early_reply " << reply->get_result()
1896 << " (" << cpp_strerror(reply->get_result())
1897 << ") " << *req << dendl;
1898
1899 if (tracei || tracedn) {
1900 if (tracei)
1901 mdr->cap_releases.erase(tracei->vino());
1902 if (tracedn)
1903 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1904
1905 set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
1906 req->get_dentry_wanted(), mdr);
1907 }
1908
1909 reply->set_extra_bl(mdr->reply_extra_bl);
1910 mds->send_message_client(reply, mdr->session);
1911
1912 mdr->did_early_reply = true;
1913
1914 mds->logger->inc(l_mds_reply);
1915 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
1916 mds->logger->tinc(l_mds_reply_latency, lat);
1917 if (client_inst.name.is_client()) {
1918 mds->sessionmap.hit_session(mdr->session);
1919 }
1920 perf_gather_op_latency(req, lat);
1921 dout(20) << "lat " << lat << dendl;
1922
1923 mdr->mark_event("early_replied");
1924 }
1925
1926 /*
1927 * send the given reply,
1928 * include a trace to tracei,
1929 * and clean up the mdr
1930 */
1931 void Server::reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply)
1932 {
1933 ceph_assert(mdr.get());
1934 const MClientRequest::const_ref &req = mdr->client_request;
1935
1936 dout(7) << "reply_client_request " << reply->get_result()
1937 << " (" << cpp_strerror(reply->get_result())
1938 << ") " << *req << dendl;
1939
1940 mdr->mark_event("replying");
1941
1942 Session *session = mdr->session;
1943
1944 // note successful request in session map?
1945 //
1946 // setfilelock requests are special: they only modify state in MDS memory.
1947 // That state is lost when the MDS fails. If a client re-sends a completed
1948 // setfilelock request, it means the client did not receive the corresponding
1949 // setfilelock reply, so the MDS should re-execute the request.
1950 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1951 reply->get_result() == 0 && session) {
1952 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1953 session->add_completed_request(mdr->reqid.tid, created);
1954 if (mdr->ls) {
1955 mdr->ls->touched_sessions.insert(session->info.inst.name);
1956 }
1957 }
1958
1959 // give any preallocated inos to the session
1960 apply_allocated_inos(mdr, session);
1961
1962 // get tracei/tracedn from mdr?
1963 snapid_t snapid = mdr->snapid;
1964 CInode *tracei = mdr->tracei;
1965 CDentry *tracedn = mdr->tracedn;
1966
1967 bool is_replay = mdr->client_request->is_replay();
1968 bool did_early_reply = mdr->did_early_reply;
1969 entity_inst_t client_inst = req->get_source_inst();
1970 int dentry_wanted = req->get_dentry_wanted();
1971
1972 if (!did_early_reply && !is_replay) {
1973
1974 mds->logger->inc(l_mds_reply);
1975 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1976 mds->logger->tinc(l_mds_reply_latency, lat);
1977 if (session && client_inst.name.is_client()) {
1978 mds->sessionmap.hit_session(session);
1979 }
1980 perf_gather_op_latency(req, lat);
1981 dout(20) << "lat " << lat << dendl;
1982
1983 if (tracei)
1984 mdr->cap_releases.erase(tracei->vino());
1985 if (tracedn)
1986 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1987 }
1988
1989 // drop non-rdlocks before replying, so that we can issue leases
1990 mdcache->request_drop_non_rdlocks(mdr);
1991
1992 // reply at all?
1993 if (session && !client_inst.name.is_mds()) {
1994 // send reply.
1995 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1996 (tracei || tracedn)) {
1997 if (is_replay) {
1998 if (tracei)
1999 mdcache->try_reconnect_cap(tracei, session);
2000 } else {
2001 // include metadata in reply
2002 set_trace_dist(session, reply, tracei, tracedn,
2003 snapid, dentry_wanted,
2004 mdr);
2005 }
2006 }
2007
2008 // We can set the extra bl unconditionally: if it's already been sent in the
2009 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2010 reply->set_extra_bl(mdr->reply_extra_bl);
2011
2012 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2013 mds->send_message_client(reply, session);
2014 }
2015
2016 if (req->is_queued_for_replay() &&
2017 (mdr->has_completed || reply->get_result() < 0)) {
2018 if (reply->get_result() < 0) {
2019 int r = reply->get_result();
2020 derr << "reply_client_request: failed to replay " << *req
2021 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2022 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2023 }
2024 mds->queue_one_replay();
2025 }
2026
2027 // clean up request
2028 mdcache->request_finish(mdr);
2029
2030 // take a closer look at tracei, if it happens to be a remote link
2031 if (tracei &&
2032 tracedn &&
2033 tracedn->get_projected_linkage()->is_remote()) {
2034 mdcache->eval_remote(tracedn);
2035 }
2036 }
2037
2038 /*
2039 * pass inode OR dentry (not both, or we may get confused)
2040 *
2041 * trace is in reverse order (i.e. root inode comes last)
2042 */
2043 void Server::set_trace_dist(Session *session, const MClientReply::ref &reply,
2044 CInode *in, CDentry *dn,
2045 snapid_t snapid,
2046 int dentry_wanted,
2047 MDRequestRef& mdr)
2048 {
2049 // skip doing this for debugging purposes?
2050 if (g_conf()->mds_inject_traceless_reply_probability &&
2051 mdr->ls && !mdr->o_trunc &&
2052 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2053 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2054 return;
2055 }
2056
2057 // trace encoding into bl: [diri inodestat, dirstat, dname, lease] if a dentry is present, then [target inodestat] if an inode is present
2058 bufferlist bl;
2059 mds_rank_t whoami = mds->get_nodeid();
2060 client_t client = session->get_client();
2061 utime_t now = ceph_clock_now();
2062
2063 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2064
2065 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
2066
2067 // realm
2068 if (snapid == CEPH_NOSNAP) {
2069 SnapRealm *realm;
2070 if (in)
2071 realm = in->find_snaprealm();
2072 else
2073 realm = dn->get_dir()->get_inode()->find_snaprealm();
2074 reply->snapbl = realm->get_snap_trace();
2075 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2076 }
2077
2078 // dir + dentry?
2079 if (dn) {
2080 reply->head.is_dentry = 1;
2081 CDir *dir = dn->get_dir();
2082 CInode *diri = dir->get_inode();
2083
2084 diri->encode_inodestat(bl, session, NULL, snapid);
2085 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2086
2087 #ifdef MDS_VERIFY_FRAGSTAT
2088 if (dir->is_complete())
2089 dir->verify_fragstat();
2090 #endif
2091 DirStat ds;
2092 ds.frag = dir->get_frag();
2093 ds.auth = dir->get_dir_auth().first;
2094 if (dir->is_auth())
2095 dir->get_dist_spec(ds.dist, whoami);
2096
2097 dir->encode_dirstat(bl, session->info, ds);
2098 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2099
2100 encode(dn->get_name(), bl);
2101 if (snapid == CEPH_NOSNAP)
2102 mds->locker->issue_client_lease(dn, client, bl, now, session);
2103 else {
2104 //null lease
2105 LeaseStat e;
2106 mds->locker->encode_lease(bl, session->info, e);
2107 }
2108 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2109 } else
2110 reply->head.is_dentry = 0;
2111
2112 // inode
2113 if (in) {
2114 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2115 dout(20) << "set_trace_dist added in " << *in << dendl;
2116 reply->head.is_target = 1;
2117 } else
2118 reply->head.is_target = 0;
2119
2120 reply->set_trace(bl);
2121 }
2122
2123 void Server::handle_client_request(const MClientRequest::const_ref &req)
2124 {
2125 dout(4) << "handle_client_request " << *req << dendl;
2126
2127 if (mds->logger)
2128 mds->logger->inc(l_mds_request);
2129 if (logger)
2130 logger->inc(l_mdss_handle_client_request);
2131
2132 if (!mdcache->is_open()) {
2133 dout(5) << "waiting for root" << dendl;
2134 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2135 return;
2136 }
2137
2138 // active session?
2139 Session *session = 0;
2140 if (req->get_source().is_client()) {
2141 session = mds->get_session(req);
2142 if (!session) {
2143 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2144 } else if (session->is_closed() ||
2145 session->is_closing() ||
2146 session->is_killing()) {
2147 dout(5) << "session closed|closing|killing, dropping" << dendl;
2148 session = NULL;
2149 }
2150 if (!session) {
2151 if (req->is_queued_for_replay())
2152 mds->queue_one_replay();
2153 return;
2154 }
2155 }
2156
2157 // old mdsmap?
2158 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2159 // send it? hrm, this isn't ideal; they may get a lot of copies if
2160 // they have a high request rate.
2161 }
2162
2163 // completed request?
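// A retried or replayed request may already have been applied and recorded
// in the session's completed_requests. If so we can usually answer with a
// traceless reply. Exceptions: a completed OPEN/CREATE is redispatched so
// the client gets its caps, and during clientreplay a completed request
// that created an inode is converted to a lookup/getattr below so the
// client receives a full trace.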
2164 bool has_completed = false;
2165 if (req->is_replay() || req->get_retry_attempt()) {
2166 ceph_assert(session);
2167 inodeno_t created;
2168 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2169 has_completed = true;
2170 // Don't send a traceless reply if the completed request created a
2171 // new inode. Treat the request as a lookup request instead.
2172 if (req->is_replay() ||
2173 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2174 req->get_op() != CEPH_MDS_OP_OPEN &&
2175 req->get_op() != CEPH_MDS_OP_CREATE)) {
2176 dout(5) << "already completed " << req->get_reqid() << dendl;
2177 auto reply = MClientReply::create(*req, 0);
2178 if (created != inodeno_t()) {
2179 bufferlist extra;
2180 encode(created, extra);
2181 reply->set_extra_bl(extra);
2182 }
2183 mds->send_message_client(reply, session);
2184
2185 if (req->is_queued_for_replay())
2186 mds->queue_one_replay();
2187
2188 return;
2189 }
2190 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2191 req->get_op() != CEPH_MDS_OP_CREATE) {
2192 dout(10) << " completed request which created new inode " << created
2193 << ", convert it to lookup request" << dendl;
2194 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2195 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2196 }
2197 }
2198 }
2199
2200 // trim completed_request list
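// The client advertises its oldest outstanding tid; completed requests
// with smaller tids can be forgotten. If the client never advances
// oldest_client_tid the list grows without bound, so warn via the cluster
// log, doubling the warning threshold after each warning.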
2201 if (req->get_oldest_client_tid() > 0) {
2202 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2203 ceph_assert(session);
2204 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2205 // The session's 'completed_requests' was dirtied; mark it to be
2206 // potentially flushed at segment expiry.
2207 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2208
2209 if (session->get_num_trim_requests_warnings() > 0 &&
2210 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2211 session->reset_num_trim_requests_warnings();
2212 } else {
2213 if (session->get_num_completed_requests() >=
2214 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2215 session->inc_num_trim_requests_warnings();
2216 stringstream ss;
2217 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2218 << req->get_oldest_client_tid() << "), "
2219 << session->get_num_completed_requests()
2220 << " completed requests recorded in session\n";
2221 mds->clog->warn() << ss.str();
2222 dout(20) << __func__ << " " << ss.str() << dendl;
2223 }
2224 }
2225 }
2226
2227 // register + dispatch
2228 MDRequestRef mdr = mdcache->request_start(req);
2229 if (!mdr.get())
2230 return;
2231
2232 if (session) {
2233 mdr->session = session;
2234 session->requests.push_back(&mdr->item_session_request);
2235 }
2236
2237 if (has_completed)
2238 mdr->has_completed = true;
2239
2240 // process embedded cap releases?
2241 // (only if NOT replay!)
2242 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2243 client_t client = req->get_source().num();
2244 for (const auto &r : req->releases) {
2245 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2246 }
2247 req->releases.clear();
2248 }
2249
2250 dispatch_client_request(mdr);
2251 return;
2252 }
2253
2254 void Server::handle_osd_map()
2255 {
2256 /* Note that we check the pool's FULL flag directly rather than
2257 * using osdmap_full_flag(), because we want to know "is the flag set"
2258 * rather than "does the flag apply to us?" */
2259 mds->objecter->with_osdmap([this](const OSDMap& o) {
2260 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2261 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2262 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2263 << o.get_epoch() << dendl;
2264 });
2265 }
2266
2267 void Server::dispatch_client_request(MDRequestRef& mdr)
2268 {
2269 // we shouldn't be waiting on anyone.
2270 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
2271
2272 if (mdr->killed) {
2273 dout(10) << "request " << *mdr << " was killed" << dendl;
2274 return;
2275 } else if (mdr->aborted) {
2276 mdr->aborted = false;
2277 mdcache->request_kill(mdr);
2278 return;
2279 }
2280
2281 const MClientRequest::const_ref &req = mdr->client_request;
2282
2283 if (logger) logger->inc(l_mdss_dispatch_client_request);
2284
2285 dout(7) << "dispatch_client_request " << *req << dendl;
2286
2287 if (req->may_write()) {
2288 if (mdcache->is_readonly()) {
2289 dout(10) << " read-only FS" << dendl;
2290 respond_to_request(mdr, -EROFS);
2291 return;
2292 }
2293 if (mdr->has_more() && mdr->more()->slave_error) {
2294 dout(10) << " got error from slaves" << dendl;
2295 respond_to_request(mdr, mdr->more()->slave_error);
2296 return;
2297 }
2298 }
2299
2300 if (is_full) {
2301 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2302 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2304 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2305 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2306 req->get_op() == CEPH_MDS_OP_CREATE ||
2307 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2308 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2309 ((req->get_op() == CEPH_MDS_OP_LINK ||
2310 req->get_op() == CEPH_MDS_OP_RENAME) &&
2311 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
2312 ) {
2313
2314 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2315 respond_to_request(mdr, -ENOSPC);
2316 return;
2317 } else {
2318 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2319 }
2320 }
2321
2322 switch (req->get_op()) {
2323 case CEPH_MDS_OP_LOOKUPHASH:
2324 case CEPH_MDS_OP_LOOKUPINO:
2325 handle_client_lookup_ino(mdr, false, false);
2326 break;
2327 case CEPH_MDS_OP_LOOKUPPARENT:
2328 handle_client_lookup_ino(mdr, true, false);
2329 break;
2330 case CEPH_MDS_OP_LOOKUPNAME:
2331 handle_client_lookup_ino(mdr, false, true);
2332 break;
2333
2334 // inodes ops.
2335 case CEPH_MDS_OP_LOOKUP:
2336 handle_client_getattr(mdr, true);
2337 break;
2338
2339 case CEPH_MDS_OP_LOOKUPSNAP:
2340 // lookupsnap does not reference a CDentry; treat it as a getattr
2341 case CEPH_MDS_OP_GETATTR:
2342 handle_client_getattr(mdr, false);
2343 break;
2344
2345 case CEPH_MDS_OP_SETATTR:
2346 handle_client_setattr(mdr);
2347 break;
2348 case CEPH_MDS_OP_SETLAYOUT:
2349 handle_client_setlayout(mdr);
2350 break;
2351 case CEPH_MDS_OP_SETDIRLAYOUT:
2352 handle_client_setdirlayout(mdr);
2353 break;
2354 case CEPH_MDS_OP_SETXATTR:
2355 handle_client_setxattr(mdr);
2356 break;
2357 case CEPH_MDS_OP_RMXATTR:
2358 handle_client_removexattr(mdr);
2359 break;
2360
2361 case CEPH_MDS_OP_READDIR:
2362 handle_client_readdir(mdr);
2363 break;
2364
2365 case CEPH_MDS_OP_SETFILELOCK:
2366 handle_client_file_setlock(mdr);
2367 break;
2368
2369 case CEPH_MDS_OP_GETFILELOCK:
2370 handle_client_file_readlock(mdr);
2371 break;
2372
2373 // funky.
2374 case CEPH_MDS_OP_CREATE:
2375 if (mdr->has_completed)
2376 handle_client_open(mdr); // already created.. just open
2377 else
2378 handle_client_openc(mdr);
2379 break;
2380
2381 case CEPH_MDS_OP_OPEN:
2382 handle_client_open(mdr);
2383 break;
2384
2385 // namespace.
2386 // no prior locks.
2387 case CEPH_MDS_OP_MKNOD:
2388 handle_client_mknod(mdr);
2389 break;
2390 case CEPH_MDS_OP_LINK:
2391 handle_client_link(mdr);
2392 break;
2393 case CEPH_MDS_OP_UNLINK:
2394 case CEPH_MDS_OP_RMDIR:
2395 handle_client_unlink(mdr);
2396 break;
2397 case CEPH_MDS_OP_RENAME:
2398 handle_client_rename(mdr);
2399 break;
2400 case CEPH_MDS_OP_MKDIR:
2401 handle_client_mkdir(mdr);
2402 break;
2403 case CEPH_MDS_OP_SYMLINK:
2404 handle_client_symlink(mdr);
2405 break;
2406
2407
2408 // snaps
2409 case CEPH_MDS_OP_LSSNAP:
2410 handle_client_lssnap(mdr);
2411 break;
2412 case CEPH_MDS_OP_MKSNAP:
2413 handle_client_mksnap(mdr);
2414 break;
2415 case CEPH_MDS_OP_RMSNAP:
2416 handle_client_rmsnap(mdr);
2417 break;
2418 case CEPH_MDS_OP_RENAMESNAP:
2419 handle_client_renamesnap(mdr);
2420 break;
2421
2422 default:
2423 dout(1) << " unknown client op " << req->get_op() << dendl;
2424 respond_to_request(mdr, -EOPNOTSUPP);
2425 }
2426 }
2427
2428
2429 // ---------------------------------------
2430 // SLAVE REQUESTS
2431
2432 void Server::handle_slave_request(const MMDSSlaveRequest::const_ref &m)
2433 {
2434 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2435 mds_rank_t from = mds_rank_t(m->get_source().num());
2436
2437 if (logger) logger->inc(l_mdss_handle_slave_request);
2438
2439 // reply?
2440 if (m->is_reply())
2441 return handle_slave_request_reply(m);
2442
2443 // the purpose of rename notify is to enforce causal message ordering, i.e. to make
2444 // sure bystanders have received all messages from the rename srcdn's auth MDS.
2445 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
2446 auto reply = MMDSSlaveRequest::create(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
2447 mds->send_message(reply, m->get_connection());
2448 return;
2449 }
2450
2451 CDentry *straydn = NULL;
2452 if (m->straybl.length() > 0) {
2453 straydn = mdcache->add_replica_stray(m->straybl, from);
2454 ceph_assert(straydn);
2455 m->straybl.clear();
2456 }
2457
2458 // am i a new slave?
2459 MDRequestRef mdr;
2460 if (mdcache->have_request(m->get_reqid())) {
2461 // existing?
2462 mdr = mdcache->request_get(m->get_reqid());
2463
2464 // is my request newer?
2465 if (mdr->attempt > m->get_attempt()) {
2466 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2467 << ", dropping " << *m << dendl;
2468 return;
2469 }
2470
2471
2472 if (mdr->attempt < m->get_attempt()) {
2473 // mine is old, close it out
2474 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2475 << ", closing out" << dendl;
2476 mdcache->request_finish(mdr);
2477 mdr.reset();
2478 } else if (mdr->slave_to_mds != from) {
2479 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
2480 return;
2481 }
2482
2483 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
2484 mdr->aborted = true;
2485 if (mdr->slave_request) {
2486 // only abort on-going xlock, wrlock and auth pin
2487 ceph_assert(!mdr->slave_did_prepare());
2488 } else {
2489 mdcache->request_finish(mdr);
2490 }
2491 return;
2492 }
2493 }
2494 if (!mdr.get()) {
2495 // new?
2496 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2497 dout(10) << "missing slave request for " << m->get_reqid()
2498 << " OP_FINISH, must have lost race with a forward" << dendl;
2499 return;
2500 }
2501 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2502 mdr->set_op_stamp(m->op_stamp);
2503 }
2504 ceph_assert(mdr->slave_request == 0); // only one at a time, please!
2505
2506 if (straydn) {
2507 mdr->pin(straydn);
2508 mdr->straydn = straydn;
2509 }
2510
2511 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2512 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2513 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2514 return;
2515 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2516 mdr->locks.empty()) {
2517 dout(3) << "not active yet, waiting" << dendl;
2518 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2519 return;
2520 }
2521
2522 mdr->reset_slave_request(m);
2523
2524 dispatch_slave_request(mdr);
2525 }
2526
2527 void Server::handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m)
2528 {
2529 mds_rank_t from = mds_rank_t(m->get_source().num());
2530
2531 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2532 metareqid_t r = m->get_reqid();
2533 if (!mdcache->have_uncommitted_master(r, from)) {
2534 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2535 << from << " reqid " << r << dendl;
2536 return;
2537 }
2538 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2539 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2540 return;
2541 }
2542
2543 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2544 metareqid_t r = m->get_reqid();
2545 mdcache->committed_master_slave(r, from);
2546 return;
2547 }
2548
2549 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2550 if (m->get_attempt() != mdr->attempt) {
2551 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2552 << m->get_attempt() << dendl;
2553 return;
2554 }
2555
2556 switch (m->get_op()) {
2557 case MMDSSlaveRequest::OP_XLOCKACK:
2558 {
2559 // identify lock, master request
2560 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2561 m->get_object_info());
2562 mdr->more()->slaves.insert(from);
2563 lock->decode_locked_state(m->get_lock_data());
2564 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2565 mdr->locks.emplace_hint(mdr->locks.end(), lock, MutationImpl::LockOp::XLOCK);
2566 mdr->finish_locking(lock);
2567 lock->get_xlock(mdr, mdr->get_client());
2568
2569 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2570 mdr->more()->waiting_on_slave.erase(from);
2571 ceph_assert(mdr->more()->waiting_on_slave.empty());
2572 mdcache->dispatch_request(mdr);
2573 }
2574 break;
2575
2576 case MMDSSlaveRequest::OP_WRLOCKACK:
2577 {
2578 // identify lock, master request
2579 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2580 m->get_object_info());
2581 mdr->more()->slaves.insert(from);
2582 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2583 auto it = mdr->locks.emplace_hint(mdr->locks.end(),
2584 lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2585 ceph_assert(it->is_remote_wrlock());
2586 ceph_assert(it->wrlock_target == from);
2587
2588 mdr->finish_locking(lock);
2589
2590 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2591 mdr->more()->waiting_on_slave.erase(from);
2592 ceph_assert(mdr->more()->waiting_on_slave.empty());
2593 mdcache->dispatch_request(mdr);
2594 }
2595 break;
2596
2597 case MMDSSlaveRequest::OP_AUTHPINACK:
2598 handle_slave_auth_pin_ack(mdr, m);
2599 break;
2600
2601 case MMDSSlaveRequest::OP_LINKPREPACK:
2602 handle_slave_link_prep_ack(mdr, m);
2603 break;
2604
2605 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2606 handle_slave_rmdir_prep_ack(mdr, m);
2607 break;
2608
2609 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2610 handle_slave_rename_prep_ack(mdr, m);
2611 break;
2612
2613 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2614 handle_slave_rename_notify_ack(mdr, m);
2615 break;
2616
2617 default:
2618 ceph_abort();
2619 }
2620 }
2621
2622 void Server::dispatch_slave_request(MDRequestRef& mdr)
2623 {
2624 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2625
2626 if (mdr->aborted) {
2627 dout(7) << " abort flag set, finishing" << dendl;
2628 mdcache->request_finish(mdr);
2629 return;
2630 }
2631
2632 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2633
2634 int op = mdr->slave_request->get_op();
2635 switch (op) {
2636 case MMDSSlaveRequest::OP_XLOCK:
2637 case MMDSSlaveRequest::OP_WRLOCK:
2638 {
2639 // identify object
2640 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2641 mdr->slave_request->get_object_info());
2642
2643 if (!lock) {
2644 dout(10) << "don't have object, dropping" << dendl;
2645 ceph_abort(); // can this happen if we auth pinned properly?
2646 }
2647 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2648 dout(10) << "not auth for remote xlock attempt, dropping on "
2649 << *lock << " on " << *lock->get_parent() << dendl;
2650 } else {
2651 // use acquire_locks so that we get auth_pinning.
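// acquire_locks() expects the request's complete lock set, so re-add the
// locks we already hold before appending the new one.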
2652 MutationImpl::LockOpVec lov;
2653 for (const auto& p : mdr->locks) {
2654 if (p.is_xlock())
2655 lov.add_xlock(p.lock);
2656 else if (p.is_wrlock())
2657 lov.add_wrlock(p.lock);
2658 }
2659
2660 int replycode = 0;
2661 switch (op) {
2662 case MMDSSlaveRequest::OP_XLOCK:
2663 lov.add_xlock(lock);
2664 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2665 break;
2666 case MMDSSlaveRequest::OP_WRLOCK:
2667 lov.add_wrlock(lock);
2668 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2669 break;
2670 }
2671
2672 if (!mds->locker->acquire_locks(mdr, lov))
2673 return;
2674
2675 // ack
2676 auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, replycode);
2677 r->set_lock_type(lock->get_type());
2678 lock->get_parent()->set_object_info(r->get_object_info());
2679 if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
2680 lock->encode_locked_state(r->get_lock_data());
2681 mds->send_message(r, mdr->slave_request->get_connection());
2682 }
2683
2684 // done.
2685 mdr->reset_slave_request();
2686 }
2687 break;
2688
2689 case MMDSSlaveRequest::OP_UNXLOCK:
2690 case MMDSSlaveRequest::OP_UNWRLOCK:
2691 {
2692 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2693 mdr->slave_request->get_object_info());
2694 ceph_assert(lock);
2695 auto it = mdr->locks.find(lock);
2696 ceph_assert(it != mdr->locks.end());
2697 bool need_issue = false;
2698 switch (op) {
2699 case MMDSSlaveRequest::OP_UNXLOCK:
2700 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2701 break;
2702 case MMDSSlaveRequest::OP_UNWRLOCK:
2703 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2704 break;
2705 }
2706 if (need_issue)
2707 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2708
2709 // done. no ack necessary.
2710 mdr->reset_slave_request();
2711 }
2712 break;
2713
2714 case MMDSSlaveRequest::OP_DROPLOCKS:
2715 mds->locker->drop_locks(mdr.get());
2716 mdr->reset_slave_request();
2717 break;
2718
2719 case MMDSSlaveRequest::OP_AUTHPIN:
2720 handle_slave_auth_pin(mdr);
2721 break;
2722
2723 case MMDSSlaveRequest::OP_LINKPREP:
2724 case MMDSSlaveRequest::OP_UNLINKPREP:
2725 handle_slave_link_prep(mdr);
2726 break;
2727
2728 case MMDSSlaveRequest::OP_RMDIRPREP:
2729 handle_slave_rmdir_prep(mdr);
2730 break;
2731
2732 case MMDSSlaveRequest::OP_RENAMEPREP:
2733 handle_slave_rename_prep(mdr);
2734 break;
2735
2736 case MMDSSlaveRequest::OP_FINISH:
2737 // information about rename imported caps
2738 if (mdr->slave_request->inode_export.length() > 0)
2739 mdr->more()->inode_import = mdr->slave_request->inode_export;
2740 // finish off request.
2741 mdcache->request_finish(mdr);
2742 break;
2743
2744 default:
2745 ceph_abort();
2746 }
2747 }
2748
2749 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2750 {
2751 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2752
2753 // build list of objects
2754 list<MDSCacheObject*> objects;
2755 CInode *auth_pin_freeze = NULL;
2756 bool fail = false, wouldblock = false, readonly = false;
2757
2758 if (mdcache->is_readonly()) {
2759 dout(10) << " read-only FS" << dendl;
2760 readonly = true;
2761 fail = true;
2762 }
2763
2764 if (!fail) {
2765 for (const auto &oi : mdr->slave_request->get_authpins()) {
2766 MDSCacheObject *object = mdcache->get_object(oi);
2767 if (!object) {
2768 dout(10) << " don't have " << oi << dendl;
2769 fail = true;
2770 break;
2771 }
2772
2773 objects.push_back(object);
2774 if (oi == mdr->slave_request->get_authpin_freeze())
2775 auth_pin_freeze = static_cast<CInode*>(object);
2776 }
2777 }
2778
2779 // can we auth pin them?
2780 if (!fail) {
2781 for (list<MDSCacheObject*>::iterator p = objects.begin();
2782 p != objects.end();
2783 ++p) {
2784 if (!(*p)->is_auth()) {
2785 dout(10) << " not auth for " << **p << dendl;
2786 fail = true;
2787 break;
2788 }
2789 if (mdr->is_auth_pinned(*p))
2790 continue;
2791 if (!mdr->can_auth_pin(*p)) {
2792 if (mdr->slave_request->is_nonblock()) {
2793 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2794 fail = true;
2795 wouldblock = true;
2796 break;
2797 }
2798 // wait
2799 dout(10) << " waiting for authpinnable on " << **p << dendl;
2800 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2801 mdr->drop_local_auth_pins();
2802
2803 mds->locker->notify_freeze_waiter(*p);
2804 return;
2805 }
2806 }
2807 }
2808
2809 // auth pin!
2810 if (fail) {
2811 mdr->drop_local_auth_pins(); // just in case
2812 } else {
2813 /* the existing frozen auth pin is on the wrong inode; unfreeze it */
2814 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2815 mdr->more()->rename_inode != auth_pin_freeze)
2816 mdr->unfreeze_auth_pin(true);
2817
2818 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2819 * on the source inode to complete. This happens after all locks for the rename
2820 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2821 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2822 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2823 * is called. The solution is to freeze the inode and prevent other MDRequests from
2824 * getting new auth pins.
2825 */
2826 if (auth_pin_freeze) {
2827 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2828 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2829 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2830 mds->mdlog->flush();
2831 return;
2832 }
2833 }
2834 for (list<MDSCacheObject*>::iterator p = objects.begin();
2835 p != objects.end();
2836 ++p) {
2837 dout(10) << "auth_pinning " << **p << dendl;
2838 mdr->auth_pin(*p);
2839 }
2840 }
2841
2842 // ack!
2843 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2844
2845 // return list of my auth_pins (if any)
2846 for (const auto &p : mdr->auth_pins) {
2847 MDSCacheObjectInfo info;
2848 p->set_object_info(info);
2849 reply->get_authpins().push_back(info);
2850 if (p == (MDSCacheObject*)auth_pin_freeze)
2851 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2852 }
2853
2854 if (wouldblock)
2855 reply->mark_error_wouldblock();
2856 if (readonly)
2857 reply->mark_error_rofs();
2858
2859 mds->send_message_mds(reply, mdr->slave_to_mds);
2860
2861 // clean up this request
2862 mdr->reset_slave_request();
2863 return;
2864 }
2865
2866 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
2867 {
2868 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2869 mds_rank_t from = mds_rank_t(ack->get_source().num());
2870
2871 // added auth pins?
2872 set<MDSCacheObject*> pinned;
2873 for (const auto &oi : ack->get_authpins()) {
2874 MDSCacheObject *object = mdcache->get_object(oi);
2875 ceph_assert(object); // we pinned it
2876 dout(10) << " remote has pinned " << *object << dendl;
2877 if (!mdr->is_auth_pinned(object))
2878 mdr->remote_auth_pins[object] = from;
2879 if (oi == ack->get_authpin_freeze())
2880 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2881 pinned.insert(object);
2882 }
2883
2884 // removed frozen auth pin ?
2885 if (mdr->more()->is_remote_frozen_authpin &&
2886 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2887 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2888 ceph_assert(p != mdr->remote_auth_pins.end());
2889 if (p->second == from) {
2890 mdr->more()->is_remote_frozen_authpin = false;
2891 }
2892 }
2893
2894 // removed auth pins?
2895 auto p = mdr->remote_auth_pins.begin();
2896 while (p != mdr->remote_auth_pins.end()) {
2897 MDSCacheObject* object = p->first;
2898 if (p->second == from && pinned.count(object) == 0) {
2899 dout(10) << " remote has unpinned " << *object << dendl;
2900 mdr->remote_auth_pins.erase(p++);
2901 } else {
2902 ++p;
2903 }
2904 }
2905
2906 if (ack->is_error_rofs()) {
2907 mdr->more()->slave_error = -EROFS;
2908 mdr->aborted = true;
2909 } else if (ack->is_error_wouldblock()) {
2910 mdr->more()->slave_error = -EWOULDBLOCK;
2911 mdr->aborted = true;
2912 }
2913
2914 // note slave
2915 mdr->more()->slaves.insert(from);
2916
2917 // clear from waiting list
2918 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2919 mdr->more()->waiting_on_slave.erase(from);
2920
2921 // go again?
2922 if (mdr->more()->waiting_on_slave.empty())
2923 mdcache->dispatch_request(mdr);
2924 else
2925 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2926 }
2927
2928
2929 // ---------------------------------------
2930 // HELPERS
2931
2932
2933 /**
2934 * check whether we are permitted to complete a request
2935 *
2936 * Check whether we have permission to perform the operation specified
2937 * by mask on the given inode, based on the capability in the mdr's
2938 * session.
2939 */
2940 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2941 {
2942 if (mdr->session) {
2943 int r = mdr->session->check_access(
2944 in, mask,
2945 mdr->client_request->get_caller_uid(),
2946 mdr->client_request->get_caller_gid(),
2947 &mdr->client_request->get_caller_gid_list(),
2948 mdr->client_request->head.args.setattr.uid,
2949 mdr->client_request->head.args.setattr.gid);
2950 if (r < 0) {
2951 respond_to_request(mdr, r);
2952 return false;
2953 }
2954 }
2955 return true;
2956 }
2957
2958 /**
2959 * check whether the fragment has reached its maximum size
2960 * (mds_bal_fragment_size_max); if so, reply -ENOSPC and return false.
2961 */
2962 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2963 {
2964 const auto size = in->get_frag_size();
2965 if (size >= g_conf()->mds_bal_fragment_size_max) {
2966 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2967 respond_to_request(mdr, -ENOSPC);
2968 return false;
2969 }
2970
2971 return true;
2972 }
2973
2974
2975 /** validate_dentry_dir
2976 *
2977 * verify that the dir exists and would own the dname.
2978 * do not check if the dentry exists.
2979 */
2980 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname)
2981 {
2982 // make sure parent is a dir?
2983 if (!diri->is_dir()) {
2984 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2985 respond_to_request(mdr, -ENOTDIR);
2986 return NULL;
2987 }
2988
2989 // which dirfrag?
2990 frag_t fg = diri->pick_dirfrag(dname);
2991 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2992 if (!dir)
2993 return 0;
2994
2995 // frozen?
2996 if (dir->is_frozen()) {
2997 dout(7) << "dir is frozen " << *dir << dendl;
2998 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2999 return NULL;
3000 }
3001
3002 return dir;
3003 }
3004
3005
3006 /** prepare_null_dentry
3007 * prepare a null (or existing) dentry in given dir.
3008 * wait for any dn lock.
3009 */
3010 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist)
3011 {
3012 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
3013 ceph_assert(dir->is_auth());
3014
3015 client_t client = mdr->get_client();
3016
3017 // does it already exist?
3018 CDentry *dn = dir->lookup(dname);
3019 if (dn) {
3020 /*
3021 if (dn->lock.is_xlocked_by_other(mdr)) {
3022 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
3023 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
3024 return 0;
3025 }
3026 */
3027 if (!dn->get_linkage(client, mdr)->is_null()) {
3028 // name already exists
3029 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
3030 if (!okexist) {
3031 respond_to_request(mdr, -EEXIST);
3032 return 0;
3033 }
3034 } else {
3035 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3036 dn->first = std::max(dn->first, next_snap);
3037 }
3038 return dn;
3039 }
3040
3041 // make sure dir is complete
3042 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
3043 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
3044 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
3045 return 0;
3046 }
3047
3048 // create
3049 dn = dir->add_null_dentry(dname, mdcache->get_global_snaprealm()->get_newest_seq() + 1);
3050 dn->mark_new();
3051 dout(10) << "prepare_null_dentry added " << *dn << dendl;
3052 return dn;
3053 }
3054
3055 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3056 {
3057 CDentry *straydn = mdr->straydn;
3058 if (straydn) {
3059 string straydname;
3060 in->name_stray_dentry(straydname);
3061 if (straydn->get_name() == straydname)
3062 return straydn;
3063
3064 ceph_assert(!mdr->done_locking);
3065 mdr->unpin(straydn);
3066 }
3067
3068 CDir *straydir = mdcache->get_stray_dir(in);
3069
3070 if (!mdr->client_request->is_replay() &&
3071 !check_fragment_space(mdr, straydir))
3072 return NULL;
3073
3074 straydn = mdcache->get_or_create_stray_dentry(in);
3075 mdr->straydn = straydn;
3076 mdr->pin(straydn);
3077 return straydn;
3078 }
3079
3080 /** prepare_new_inode
3081 *
3082 * create a new inode. set c/m/atime. hit dir pop.
3083 */
3084 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3085 file_layout_t *layout)
3086 {
3087 CInode *in = new CInode(mdcache);
3088
3089 // Server::prepare_force_open_sessions() can re-open a session in the closing
3090 // state. In that corner case, the session's prealloc_inos are being freed.
3091 // To simplify the code, we disallow using/refilling the session's prealloc_inos
3092 // while the session is opening.
3093 bool allow_prealloc_inos = !mdr->session->is_opening();
3094
3095 // assign ino
3096 if (allow_prealloc_inos &&
3097 mdr->session->info.prealloc_inos.size()) {
3098 mdr->used_prealloc_ino =
3099 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
3100 mds->sessionmap.mark_projected(mdr->session);
3101
3102 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3103 << " (" << mdr->session->info.prealloc_inos
3104 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
3105 << dendl;
3106 } else {
3107 mdr->alloc_ino =
3108 in->inode.ino = mds->inotable->project_alloc_id();
3109 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3110 }
3111
3112 if (useino && useino != in->inode.ino) {
3113 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
3114 mds->clog->error() << mdr->client_request->get_source()
3115 << " specified ino " << useino
3116 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
3117 //ceph_abort(); // just for now.
3118 }
3119
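// Refill the session's preallocated ino pool once the projected count
// drops below half of mds_client_prealloc_inos, topping it back up to the
// configured size.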
3120 if (allow_prealloc_inos &&
3121 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3122 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3123 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3124 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3125 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3126 mds->sessionmap.mark_projected(mdr->session);
3127 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3128 }
3129
3130 in->inode.version = 1;
3131 in->inode.xattr_version = 1;
3132 in->inode.nlink = 1; // FIXME
3133
3134 in->inode.mode = mode;
3135
3136 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
3137 if (in->inode.is_dir()) {
3138 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3139 } else if (layout) {
3140 in->inode.layout = *layout;
3141 } else {
3142 in->inode.layout = mdcache->default_file_layout;
3143 }
3144
3145 in->inode.truncate_size = -1ull; // not truncated, yet!
3146 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3147
3148 CInode *diri = dir->get_inode();
3149
3150 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
3151
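// POSIX setgid-directory semantics: a new child inherits the directory's
// gid (and a new subdirectory inherits the setgid bit too); otherwise the
// child gets the caller's gid.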
3152 if (diri->inode.mode & S_ISGID) {
3153 dout(10) << " dir is sticky" << dendl;
3154 in->inode.gid = diri->inode.gid;
3155 if (S_ISDIR(mode)) {
3156 dout(10) << " new dir also sticky" << dendl;
3157 in->inode.mode |= S_ISGID;
3158 }
3159 } else
3160 in->inode.gid = mdr->client_request->get_caller_gid();
3161
3162 in->inode.uid = mdr->client_request->get_caller_uid();
3163
3164 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
3165 mdr->get_op_stamp();
3166
3167 in->inode.change_attr = 0;
3168
3169 const MClientRequest::const_ref &req = mdr->client_request;
3170 if (req->get_data().length()) {
3171 auto p = req->get_data().cbegin();
3172
3173 // xattrs on new inode?
3174 CInode::mempool_xattr_map xattrs;
3175 decode(xattrs, p);
3176 for (const auto &p : xattrs) {
3177 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
3178 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
3179 if (!em.second)
3180 em.first->second = p.second;
3181 }
3182 }
3183
3184 if (!mds->mdsmap->get_inline_data_enabled() ||
3185 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3186 in->inode.inline_data.version = CEPH_INLINE_NONE;
3187
3188 mdcache->add_inode(in); // add
3189 dout(10) << "prepare_new_inode " << *in << dendl;
3190 return in;
3191 }
3192
3193 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3194 {
3195 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3196 << " inotablev " << mds->inotable->get_projected_version()
3197 << dendl;
3198 blob->set_ino_alloc(mdr->alloc_ino,
3199 mdr->used_prealloc_ino,
3200 mdr->prealloc_inos,
3201 mdr->client_request->get_source(),
3202 mds->sessionmap.get_projected(),
3203 mds->inotable->get_projected_version());
3204 }
3205
3206 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3207 {
3208 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3209 << " / " << mdr->prealloc_inos
3210 << " / " << mdr->used_prealloc_ino << dendl;
3211
3212 if (mdr->alloc_ino) {
3213 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3214 }
3215 if (mdr->prealloc_inos.size()) {
3216 ceph_assert(session);
3217 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3218 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3219 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3220 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3221 }
3222 if (mdr->used_prealloc_ino) {
3223 ceph_assert(session);
3224 session->info.used_inos.erase(mdr->used_prealloc_ino);
3225 mds->sessionmap.mark_dirty(session);
3226 }
3227 }
3228
3229 class C_MDS_TryFindInode : public ServerContext {
3230 MDRequestRef mdr;
3231 public:
3232 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3233 void finish(int r) override {
3234 if (r == -ESTALE) // :( find_ino_peers failed
3235 server->respond_to_request(mdr, r);
3236 else
3237 server->dispatch_client_request(mdr);
3238 }
3239 };
3240
3241 class CF_MDS_MDRContextFactory : public MDSContextFactory {
3242 public:
3243 CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr) : cache(cache), mdr(mdr) {}
3244 MDSContext *build() {
3245 return new C_MDS_RetryRequest(cache, mdr);
3246 }
3247 private:
3248 MDCache *cache;
3249 MDRequestRef mdr;
3250 };
3251
3252 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
3253 {
3254 // figure parent dir vs dname
3255 if (refpath.depth() == 0) {
3256 dout(7) << "can't do that to root" << dendl;
3257 respond_to_request(mdr, -EINVAL);
3258 return 0;
3259 }
3260 string dname = refpath.last_dentry();
3261 refpath.pop_dentry();
3262
3263 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
3264
3265 // traverse to parent dir
3266 CInode *diri;
3267 CF_MDS_MDRContextFactory cf(mdcache, mdr);
3268 int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
3269 if (r > 0) return 0; // delayed
3270 if (r < 0) {
3271 if (r == -ESTALE) {
3272 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3273 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3274 return 0;
3275 }
3276 respond_to_request(mdr, r);
3277 return 0;
3278 }
3279
3280 // is it an auth dir?
3281 CDir *dir = validate_dentry_dir(mdr, diri, dname);
3282 if (!dir)
3283 return 0; // forwarded or waiting for freeze
3284
3285 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
3286 return dir;
3287 }
3288
3289 /* If this returns null, the request has been handled
3290 * as appropriate: forwarded on, or the client's been replied to */
3291 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
3292 MutationImpl::LockOpVec& lov,
3293 bool want_auth,
3294 bool no_want_auth, /* for readdir, which doesn't want auth _even if_ it's
3295 a snapped dir */
3296 file_layout_t **layout,
3297 bool no_lookup) // true if we cannot return a null dentry lease
3298 {
3299 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
3300 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3301
3302 if (mdr->done_locking)
3303 return mdr->in[n];
3304
3305 // traverse
3306 CF_MDS_MDRContextFactory cf(mdcache, mdr);
3307 int r = mdcache->path_traverse(mdr, cf, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
3308 if (r > 0)
3309 return NULL; // delayed
3310 if (r < 0) { // error
3311 if (r == -ENOENT && n == 0 && !mdr->dn[n].empty()) {
3312 if (!no_lookup) {
3313 mdr->tracedn = mdr->dn[n].back();
3314 }
3315 respond_to_request(mdr, r);
3316 } else if (r == -ESTALE) {
3317 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3318 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3319 mdcache->find_ino_peers(refpath.get_ino(), c);
3320 } else {
3321 dout(10) << "FAIL on error " << r << dendl;
3322 respond_to_request(mdr, r);
3323 }
3324 return 0;
3325 }
3326 CInode *ref = mdr->in[n];
3327 dout(10) << "ref is " << *ref << dendl;
3328
3329 // fw to inode auth?
3330 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
3331 want_auth = true;
3332
3333 if (want_auth) {
3334 if (ref->is_ambiguous_auth()) {
3335 dout(10) << "waiting for single auth on " << *ref << dendl;
3336 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
3337 return 0;
3338 }
3339 if (!ref->is_auth()) {
3340 dout(10) << "fw to auth for " << *ref << dendl;
3341 mdcache->request_forward(mdr, ref->authority().first);
3342 return 0;
3343 }
3344
3345 // auth_pin?
3346 // do NOT proceed if freezing, as cap release may defer in that case, and
3347 // we could deadlock when we try to lock @ref.
3348 // if we're already auth_pinned, continue; the release has already been processed.
3349 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3350 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3351 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3352 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3353 /* If we have any auth pins, this will deadlock.
3354 * But the only way we can get here while already holding auth pins
3355 * is that we're on an inode with snapshots that got updated
3356 * between dispatches of this request. So we're going to drop
3357 * our locks and our auth pins and reacquire them later.
3358 *
3359 * This is safe since we're only in this function when working on
3360 * a single MDS request; otherwise we'd be in
3361 * rdlock_path_xlock_dentry.
3362 */
3363 mds->locker->drop_locks(mdr.get(), NULL);
3364 mdr->drop_local_auth_pins();
3365 if (!mdr->remote_auth_pins.empty())
3366 mds->locker->notify_freeze_waiter(ref);
3367 return 0;
3368 }
3369
3370 mdr->auth_pin(ref);
3371 }
3372
3373 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3374 lov.add_rdlock(&mdr->dn[n][i]->lock);
3375 if (layout)
3376 mds->locker->include_snap_rdlocks_wlayout(ref, lov, layout);
3377 else
3378 mds->locker->include_snap_rdlocks(ref, lov);
3379
3380 // set and pin ref
3381 mdr->pin(ref);
3382 return ref;
3383 }
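// Illustrative caller sketch (the getattr/open handlers below follow this
// pattern); a null return always means mdr has already been dealt with:
//
//   MutationImpl::LockOpVec lov;
//   CInode *ref = rdlock_path_pin_ref(mdr, 0, lov, want_auth);
//   if (!ref)
//     return;  // forwarded, delayed, or replied to
//   // ...append per-field rdlocks to lov, then locker->acquire_locks()...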
3384
3385
3386 /** rdlock_path_xlock_dentry
3387 * traverse path to the directory that could/would contain dentry.
3388 * make sure i am auth for that dentry, forward as necessary.
3389 * create null dentry in place (or use existing if okexist).
3390 * get rdlocks on traversed dentries, xlock on new dentry.
3391 */
3392 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
3393 MutationImpl::LockOpVec& lov,
3394 bool okexist, bool mustexist, bool alwaysxlock,
3395 file_layout_t **layout)
3396 {
3397 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
3398
3399 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3400
3401 client_t client = mdr->get_client();
3402
3403 if (mdr->done_locking)
3404 return mdr->dn[n].back();
3405
3406 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
3407 if (!dir) return 0;
3408
3409 CInode *diri = dir->get_inode();
3410 if (!mdr->reqid.name.is_mds()) {
3411 if (diri->is_system() && !diri->is_root()) {
3412 respond_to_request(mdr, -EROFS);
3413 return 0;
3414 }
3415 }
3416 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3417 respond_to_request(mdr, -ENOENT);
3418 return 0;
3419 }
3420
3421 // make a null dentry?
3422 std::string_view dname = refpath.last_dentry();
3423 CDentry *dn;
3424 if (mustexist) {
3425 dn = dir->lookup(dname);
3426
3427 // make sure dir is complete
3428 if (!dn && !dir->is_complete() &&
3429 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
3430 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
3431 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
3432 return 0;
3433 }
3434
3435 // readable?
3436 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
3437 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
3438 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
3439 return 0;
3440 }
3441
3442 // exists?
3443 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
3444 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
3445 respond_to_request(mdr, -ENOENT);
3446 return 0;
3447 }
3448 } else {
3449 dn = prepare_null_dentry(mdr, dir, dname, okexist);
3450 if (!dn)
3451 return 0;
3452 }
3453
3454 mdr->dn[n].push_back(dn);
3455 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
3456 mdr->in[n] = dnl->get_inode();
3457
3458 // -- lock --
3459 // NOTE: rename takes the same set of locks for srcdn
3460 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3461 lov.add_rdlock(&mdr->dn[n][i]->lock);
3462 if (alwaysxlock || dnl->is_null())
3463 lov.add_xlock(&dn->lock); // new dn, xlock
3464 else
3465 lov.add_rdlock(&dn->lock); // existing dn, rdlock
3466 lov.add_wrlock(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime
3467 lov.add_wrlock(&dn->get_dir()->inode->nestlock); // and wrlock on dir nested (rstat) accounting
3468 if (layout)
3469 mds->locker->include_snap_rdlocks_wlayout(dn->get_dir()->inode, lov, layout);
3470 else
3471 mds->locker->include_snap_rdlocks(dn->get_dir()->inode, lov);
3472
3473 return dn;
3474 }
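// Illustrative caller sketch (see handle_client_openc() below): on success
// the traversed dentries' rdlocks and the target dentry's xlock have been
// added to lov.
//
//   MutationImpl::LockOpVec lov;
//   file_layout_t *dir_layout = nullptr;
//   CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, okexist,
//                                          false, false, &dir_layout);
//   if (!dn)
//     return;  // mdr already handled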
3475
3476
3477
3478
3479
3480 /**
3481 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3482 *
3483 * @param diri base inode
3484 * @param fg the exact frag we want
3485 * @param mdr request
3486 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3487 */
3488 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3489 {
3490 CDir *dir = diri->get_dirfrag(fg);
3491
3492 // not open and inode not mine?
3493 if (!dir && !diri->is_auth()) {
3494 mds_rank_t inauth = diri->authority().first;
3495 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3496 mdcache->request_forward(mdr, inauth);
3497 return 0;
3498 }
3499
3500 // not open and inode frozen?
3501 if (!dir && diri->is_frozen()) {
3502 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3503 ceph_assert(diri->get_parent_dir());
3504 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3505 return 0;
3506 }
3507
3508 // invent?
3509 if (!dir)
3510 dir = diri->get_or_open_dirfrag(mdcache, fg);
3511
3512 // am i auth for the dirfrag?
3513 if (!dir->is_auth()) {
3514 mds_rank_t auth = dir->authority().first;
3515 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3516 << ", fw to mds." << auth << dendl;
3517 mdcache->request_forward(mdr, auth);
3518 return 0;
3519 }
3520
3521 return dir;
3522 }
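// Illustrative caller sketch (readdir and the snap-ino lookup below):
//
//   CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
//   if (!dir)
//     return;  // forwarded to the auth MDS or waiting on a freeze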
3523
3524
3525 // ===============================================================================
3526 // STAT
3527
3528 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3529 {
3530 const MClientRequest::const_ref &req = mdr->client_request;
3531
3532 if (req->get_filepath().depth() == 0 && is_lookup) {
3533 // refpath can't be empty for lookup but it can for
3534 // getattr (we do getattr with empty refpath for mount of '/')
3535 respond_to_request(mdr, -EINVAL);
3536 return;
3537 }
3538
3539 bool want_auth = false;
3540 int mask = req->head.args.getattr.mask;
3541 if (mask & CEPH_STAT_RSTAT)
3542 want_auth = true; // rstats are only accurate on the auth MDS
3543
3544 MutationImpl::LockOpVec lov;
3545 CInode *ref = rdlock_path_pin_ref(mdr, 0, lov, want_auth, false, NULL,
3546 !is_lookup);
3547 if (!ref) return;
3548
3549 /*
3550 * if client currently holds the EXCL cap on a field, do not rdlock
3551 * it; client's stat() will result in valid info if _either_ EXCL
3552 * cap is held or MDS rdlocks and reads the value here.
3553 *
3554 * handling this case here is easier than weakening rdlock
3555 * semantics... that would cause problems elsewhere.
3556 */
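// Concrete example of the rule above (illustrative): a client holding
// CEPH_CAP_AUTH_EXCL already has authoritative mode/uid/gid locally, so
// the authlock rdlock is skipped below even when the mask requests it.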
3557 client_t client = mdr->get_client();
3558 int issued = 0;
3559 Capability *cap = ref->get_client_cap(client);
3560 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3561 mdr->snapid <= cap->client_follows))
3562 issued = cap->issued();
3563
3564 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3565 lov.add_rdlock(&ref->linklock);
3566 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3567 lov.add_rdlock(&ref->authlock);
3568 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3569 lov.add_rdlock(&ref->xattrlock);
3570 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3571 // Don't wait on unstable filelock if client is allowed to read file size.
3572 // This can reduce the response time of getattr in the case that multiple
3573 // clients do stat(2) and there are writers.
3574 // The downside of this optimization is that mds may not issue Fs caps along
3575 // with getattr reply. Client may need to send more getattr requests.
3576 if (mdr->is_rdlocked(&ref->filelock)) {
3577 lov.add_rdlock(&ref->filelock);
3578 } else if (ref->filelock.is_stable() ||
3579 ref->filelock.get_num_wrlocks() > 0 ||
3580 !ref->filelock.can_read(mdr->get_client())) {
3581 lov.add_rdlock(&ref->filelock);
3582 mdr->done_locking = false;
3583 }
3584 }
3585
3586 if (!mds->locker->acquire_locks(mdr, lov))
3587 return;
3588
3589 if (!check_access(mdr, ref, MAY_READ))
3590 return;
3591
3592 utime_t now = ceph_clock_now();
3593 mdr->set_mds_stamp(now);
3594
3595 // note which caps are requested, so we return at least a snapshot
3596 // value for them. (currently this matters for xattrs and inline data)
3597 mdr->getattr_caps = mask;
3598
3599 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3600
3601 // reply
3602 dout(10) << "reply to stat on " << *req << dendl;
3603 mdr->tracei = ref;
3604 if (is_lookup)
3605 mdr->tracedn = mdr->dn[0].back();
3606 respond_to_request(mdr, 0);
3607 }
3608
3609 struct C_MDS_LookupIno2 : public ServerContext {
3610 MDRequestRef mdr;
3611 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3612 void finish(int r) override {
3613 server->_lookup_ino_2(mdr, r);
3614 }
3615 };
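// open_ino() completes this context with r >= 0 (the MDS rank that has the
// inode) or a negative error; _lookup_ino_2() below handles both cases.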
3616
3617 /*
3618 * filepath: ino
3619 */
3620 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3621 bool want_parent, bool want_dentry)
3622 {
3623 const MClientRequest::const_ref &req = mdr->client_request;
3624
3625 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3626 return _lookup_snap_ino(mdr);
3627
3628 inodeno_t ino = req->get_filepath().get_ino();
3629 CInode *in = mdcache->get_inode(ino);
3630 if (in && in->state_test(CInode::STATE_PURGING)) {
3631 respond_to_request(mdr, -ESTALE);
3632 return;
3633 }
3634 if (!in) {
3635 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3636 return;
3637 }
3638
3639 if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
3640 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3641 return;
3642 }
3643
3644 // check for nothing (not read or write); this still applies the
3645 // path check.
3646 if (!check_access(mdr, in, 0))
3647 return;
3648
3649 CDentry *dn = in->get_projected_parent_dn();
3650 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3651
3652 MutationImpl::LockOpVec lov;
3653 if (dn && (want_parent || want_dentry)) {
3654 mdr->pin(dn);
3655 lov.add_rdlock(&dn->lock);
3656 }
3657
3658 unsigned mask = req->head.args.lookupino.mask;
3659 if (mask) {
3660 Capability *cap = in->get_client_cap(mdr->get_client());
3661 int issued = 0;
3662 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3663 issued = cap->issued();
3664 // permission bits, ACL/security xattrs
3665 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3666 lov.add_rdlock(&in->authlock);
3667 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3668 lov.add_rdlock(&in->xattrlock);
3669
3670 mdr->getattr_caps = mask;
3671 }
3672
3673 if (!lov.empty()) {
3674 if (!mds->locker->acquire_locks(mdr, lov))
3675 return;
3676
3677 if (diri != NULL) {
3678 // need read access to directory inode
3679 if (!check_access(mdr, diri, MAY_READ))
3680 return;
3681 }
3682 }
3683
3684 if (want_parent) {
3685 if (in->is_base()) {
3686 respond_to_request(mdr, -EINVAL);
3687 return;
3688 }
3689 if (!diri || diri->is_stray()) {
3690 respond_to_request(mdr, -ESTALE);
3691 return;
3692 }
3693 dout(10) << "reply to lookup_parent " << *in << dendl;
3694 mdr->tracei = diri;
3695 respond_to_request(mdr, 0);
3696 } else {
3697 if (want_dentry) {
3698 inodeno_t dirino = req->get_filepath2().get_ino();
3699 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3700 respond_to_request(mdr, -ENOENT);
3701 return;
3702 }
3703 dout(10) << "reply to lookup_name " << *in << dendl;
3704 } else
3705 dout(10) << "reply to lookup_ino " << *in << dendl;
3706
3707 mdr->tracei = in;
3708 if (want_dentry)
3709 mdr->tracedn = dn;
3710 respond_to_request(mdr, 0);
3711 }
3712 }
3713
3714 void Server::_lookup_snap_ino(MDRequestRef& mdr)
3715 {
3716 const MClientRequest::const_ref &req = mdr->client_request;
3717
3718 vinodeno_t vino;
3719 vino.ino = req->get_filepath().get_ino();
3720 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3721 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3722 __u32 hash = req->head.args.lookupino.hash;
3723
3724 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3725
3726 CInode *in = mdcache->lookup_snap_inode(vino);
3727 if (!in) {
3728 in = mdcache->get_inode(vino.ino);
3729 if (in) {
3730 if (in->state_test(CInode::STATE_PURGING) ||
3731 !in->has_snap_data(vino.snapid)) {
3732 if (in->is_dir() || !parent_ino) {
3733 respond_to_request(mdr, -ESTALE);
3734 return;
3735 }
3736 in = NULL;
3737 }
3738 }
3739 }
3740
3741 if (in) {
3742 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
3743 mdr->snapid = vino.snapid;
3744 mdr->tracei = in;
3745 respond_to_request(mdr, 0);
3746 return;
3747 }
3748
3749 CInode *diri = NULL;
3750 if (parent_ino) {
3751 diri = mdcache->get_inode(parent_ino);
3752 if (!diri) {
3753 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
3754 return;
3755 }
3756
3757 if (!diri->is_dir()) {
3758 respond_to_request(mdr, -EINVAL);
3759 return;
3760 }
3761
3762 MutationImpl::LockOpVec lov;
3763 lov.add_rdlock(&diri->dirfragtreelock);
3764 if (!mds->locker->acquire_locks(mdr, lov))
3765 return;
3766
3767 frag_t frag = diri->dirfragtree[hash];
3768 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
3769 if (!dir)
3770 return;
3771
3772 if (!dir->is_complete()) {
3773 if (dir->is_frozen()) {
3774 mds->locker->drop_locks(mdr.get());
3775 mdr->drop_local_auth_pins();
3776 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3777 return;
3778 }
3779 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3780 return;
3781 }
3782
3783 respond_to_request(mdr, -ESTALE);
3784 } else {
3785 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
3786 }
3787 }
3788
3789 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3790 {
3791 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3792 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3793
3794 // `r` is a rank if >=0, else an error code
3795 if (r >= 0) {
3796 mds_rank_t dest_rank(r);
3797 if (dest_rank == mds->get_nodeid())
3798 dispatch_client_request(mdr);
3799 else
3800 mdcache->request_forward(mdr, dest_rank);
3801 return;
3802 }
3803
3804 // give up
3805 if (r == -ENOENT || r == -ENODATA)
3806 r = -ESTALE;
3807 respond_to_request(mdr, r);
3808 }
3809
3810
3811 /* This function takes responsibility for the passed mdr*/
3812 void Server::handle_client_open(MDRequestRef& mdr)
3813 {
3814 const MClientRequest::const_ref &req = mdr->client_request;
3815 dout(7) << "open on " << req->get_filepath() << dendl;
3816
3817 int flags = req->head.args.open.flags;
3818 int cmode = ceph_flags_to_mode(flags);
3819 if (cmode < 0) {
3820 respond_to_request(mdr, -EINVAL);
3821 return;
3822 }
3823
3824 bool need_auth = !file_mode_is_readonly(cmode) ||
3825 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3826
3827 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3828 dout(7) << "read-only FS" << dendl;
3829 respond_to_request(mdr, -EROFS);
3830 return;
3831 }
3832
3833 MutationImpl::LockOpVec lov;
3834 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, need_auth);
3835 if (!cur)
3836 return;
3837
3838 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3839 ceph_assert(!need_auth);
3840 mdr->done_locking = false;
3841 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
3842 if (!cur)
3843 return;
3844 }
3845
3846 if (!cur->inode.is_file()) {
3847 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3848 cmode = CEPH_FILE_MODE_PIN;
3849 // the inode is a symlink and the client wants to follow it; ignore the O_TRUNC flag.
3850 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3851 flags &= ~CEPH_O_TRUNC;
3852 }
3853
3854 dout(10) << "open flags = " << flags
3855 << ", filemode = " << cmode
3856 << ", need_auth = " << need_auth
3857 << dendl;
3858
3859 // regular file?
3860 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3861 dout(7) << "not a file or dir " << *cur << dendl;
3862 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3863 return;
3864 }*/
3865 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3866 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3867 respond_to_request(mdr, -EINVAL);
3868 return;
3869 }
3870
3871 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3872 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3873 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3874 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3875 return;
3876 }
3877
3878 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3879 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3880 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3881 respond_to_request(mdr, -EPERM);
3882 return;
3883 }
3884
3885 // snapped data is read only
3886 if (mdr->snapid != CEPH_NOSNAP &&
3887 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3888 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3889 respond_to_request(mdr, -EROFS);
3890 return;
3891 }
3892
3893 unsigned mask = req->head.args.open.mask;
3894 if (mask) {
3895 Capability *cap = cur->get_client_cap(mdr->get_client());
3896 int issued = 0;
3897 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3898 issued = cap->issued();
3899 // permission bits, ACL/security xattrs
3900 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3901 lov.add_rdlock(&cur->authlock);
3902 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3903 lov.add_rdlock(&cur->xattrlock);
3904
3905 mdr->getattr_caps = mask;
3906 }
3907
3908 // O_TRUNC
3909 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3910 ceph_assert(cur->is_auth());
3911
3912 lov.add_xlock(&cur->filelock);
3913 if (!mds->locker->acquire_locks(mdr, lov))
3914 return;
3915
3916 if (!check_access(mdr, cur, MAY_WRITE))
3917 return;
3918
3919 // wait for pending truncate?
3920 const auto pi = cur->get_projected_inode();
3921 if (pi->is_truncating()) {
3922 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3923 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3924 mds->locker->drop_locks(mdr.get());
3925 mdr->drop_local_auth_pins();
3926 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3927 return;
3928 }
3929
3930 do_open_truncate(mdr, cmode);
3931 return;
3932 }
3933
3934 // sync filelock if snapped.
3935 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3936 // and that data itself is flushed so that we can read the snapped data off disk.
3937 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3938 lov.add_rdlock(&cur->filelock);
3939 }
3940
3941 if (!mds->locker->acquire_locks(mdr, lov))
3942 return;
3943
3944 mask = MAY_READ;
3945 if (cmode & CEPH_FILE_MODE_WR)
3946 mask |= MAY_WRITE;
3947 if (!check_access(mdr, cur, mask))
3948 return;
3949
3950 utime_t now = ceph_clock_now();
3951 mdr->set_mds_stamp(now);
3952
3953 if (cur->is_file() || cur->is_dir()) {
3954 if (mdr->snapid == CEPH_NOSNAP) {
3955 // register new cap
3956 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3957 if (cap)
3958 dout(12) << "open issued caps " << ccap_string(cap->pending())
3959 << " for " << req->get_source()
3960 << " on " << *cur << dendl;
3961 } else {
3962 int caps = ceph_caps_for_mode(cmode);
3963 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3964 << " for " << req->get_source()
3965 << " snapid " << mdr->snapid
3966 << " on " << *cur << dendl;
3967 mdr->snap_caps = caps;
3968 }
3969 }
3970
3971 // increase max_size?
3972 if (cmode & CEPH_FILE_MODE_WR)
3973 mds->locker->check_inode_max_size(cur);
3974
3975 // make sure this inode gets into the journal
3976 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3977 mdcache->open_file_table.should_log_open(cur)) {
3978 EOpen *le = new EOpen(mds->mdlog);
3979 mdlog->start_entry(le);
3980 le->add_clean_inode(cur);
3981 mdlog->submit_entry(le);
3982 }
3983
3984 // hit pop
3985 if (cmode & CEPH_FILE_MODE_WR)
3986 mds->balancer->hit_inode(cur, META_POP_IWR);
3987 else
3988 mds->balancer->hit_inode(cur, META_POP_IRD,
3989 mdr->client_request->get_source().num());
3990
3991 CDentry *dn = 0;
3992 if (req->get_dentry_wanted()) {
3993 ceph_assert(mdr->dn[0].size());
3994 dn = mdr->dn[0].back();
3995 }
3996
3997 mdr->tracei = cur;
3998 mdr->tracedn = dn;
3999 respond_to_request(mdr, 0);
4000 }
4001
4002 class C_MDS_openc_finish : public ServerLogContext {
4003 CDentry *dn;
4004 CInode *newi;
4005 public:
4006 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4007 ServerLogContext(s, r), dn(d), newi(ni) {}
4008 void finish(int r) override {
4009 ceph_assert(r == 0);
4010
4011 dn->pop_projected_linkage();
4012
4013 // dirty inode, dn, dir
4014 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
4015 newi->mark_dirty(newi->inode.version+1, mdr->ls);
4016 newi->mark_dirty_parent(mdr->ls, true);
4017
4018 mdr->apply();
4019
4020 get_mds()->locker->share_inode_max_size(newi);
4021
4022 MDRequestRef null_ref;
4023 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4024
4025 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4026
4027 server->respond_to_request(mdr, 0);
4028
4029 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4030 }
4031 };
4032
4033 /* This function takes responsibility for the passed mdr*/
4034 void Server::handle_client_openc(MDRequestRef& mdr)
4035 {
4036 const MClientRequest::const_ref &req = mdr->client_request;
4037 client_t client = mdr->get_client();
4038
4039 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4040
4041 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4042 if (cmode < 0) {
4043 respond_to_request(mdr, -EINVAL);
4044 return;
4045 }
4046
4047 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4048
4049 if (!excl) {
4050 CF_MDS_MDRContextFactory cf(mdcache, mdr);
4051 int r = mdcache->path_traverse(mdr, cf, req->get_filepath(),
4052 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
4053 if (r > 0) return;
4054 if (r == 0) {
4055 // it existed.
4056 handle_client_open(mdr);
4057 return;
4058 }
4059 if (r < 0 && r != -ENOENT) {
4060 if (r == -ESTALE) {
4061 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
4062 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
4063 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
4064 } else {
4065 dout(10) << "FAIL on error " << r << dendl;
4066 respond_to_request(mdr, r);
4067 }
4068 return;
4069 }
4070 }
4071
4072 MutationImpl::LockOpVec lov;
4073 file_layout_t *dir_layout = nullptr;
4074 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov,
4075 !excl, false, false, &dir_layout);
4076 if (!dn) return;
4077 if (mdr->snapid != CEPH_NOSNAP) {
4078 respond_to_request(mdr, -EROFS);
4079 return;
4080 }
4081 // set layout
4082 file_layout_t layout;
4083 if (dir_layout)
4084 layout = *dir_layout;
4085 else
4086 layout = mdcache->default_file_layout;
4087
4088 // What kind of client caps are required to complete this operation
4089 uint64_t access = MAY_WRITE;
4090
4091 const auto default_layout = layout;
4092
4093 // fill in any special params from client
4094 if (req->head.args.open.stripe_unit)
4095 layout.stripe_unit = req->head.args.open.stripe_unit;
4096 if (req->head.args.open.stripe_count)
4097 layout.stripe_count = req->head.args.open.stripe_count;
4098 if (req->head.args.open.object_size)
4099 layout.object_size = req->head.args.open.object_size;
4100 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4101 (__s32)req->head.args.open.pool >= 0) {
4102 layout.pool_id = req->head.args.open.pool;
4103
4104 // make sure we have as new a map as the client
4105 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4106 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4107 return;
4108 }
4109 }
4110
4111 // If client doesn't have capability to modify layout pools, then
4112 // only permit this request if the requested pool matches what the
4113 // file would have inherited anyway from its parent.
4114 if (default_layout != layout) {
4115 access |= MAY_SET_VXATTR;
4116 }
4117
4118 if (!layout.is_valid()) {
4119 dout(10) << " invalid initial file layout" << dendl;
4120 respond_to_request(mdr, -EINVAL);
4121 return;
4122 }
4123 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4124 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4125 respond_to_request(mdr, -EINVAL);
4126 return;
4127 }
4128
4129 // created null dn.
4130 CDir *dir = dn->get_dir();
4131 CInode *diri = dir->get_inode();
4132 lov.add_rdlock(&diri->authlock);
4133 if (!mds->locker->acquire_locks(mdr, lov))
4134 return;
4135
4136 if (!check_access(mdr, diri, access))
4137 return;
4138
4139 if (!check_fragment_space(mdr, dir))
4140 return;
4141
4142 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4143
4144 if (!dnl->is_null()) {
4145 // it existed.
4146 ceph_assert(req->head.args.open.flags & CEPH_O_EXCL);
4147 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
4148 mdr->tracei = dnl->get_inode();
4149 mdr->tracedn = dn;
4150 respond_to_request(mdr, -EEXIST);
4151 return;
4152 }
4153
4154 // create inode.
4155 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4156 req->head.args.open.mode | S_IFREG, &layout);
4157 ceph_assert(in);
4158
4159 // it's a file.
4160 dn->push_projected_linkage(in);
4161
4162 in->inode.version = dn->pre_dirty();
4163 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4164 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
4165 in->inode.update_backtrace();
4166 in->inode.rstat.rfiles = 1;
4167
4168 SnapRealm *realm = diri->find_snaprealm();
4169 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4170 ceph_assert(follows >= realm->get_newest_seq());
4171
4172 ceph_assert(dn->first == follows+1);
4173 in->first = dn->first;
4174
4175 // do the open
4176 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
4177 in->authlock.set_state(LOCK_EXCL);
4178 in->xattrlock.set_state(LOCK_EXCL);
4179
4180 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4181 in->inode.client_ranges[client].range.first = 0;
4182 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
4183 in->inode.client_ranges[client].follows = follows;
4184 cap->mark_clientwriteable();
4185 }
4186
4187 // prepare finisher
4188 mdr->ls = mdlog->get_current_segment();
4189 EUpdate *le = new EUpdate(mdlog, "openc");
4190 mdlog->start_entry(le);
4191 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4192 journal_allocated_inos(mdr, &le->metablob);
4193 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4194 le->metablob.add_primary_dentry(dn, in, true, true, true);
4195
4196 // make sure this inode gets into the journal
4197 le->metablob.add_opened_ino(in->ino());
4198
4199 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
4200
4201 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4202 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4203 // add the created ino to the reply when the client supports the REPLY_CREATE_INODE feature
4204 encode(in->inode.ino, mdr->reply_extra_bl);
4205 }
4206
4207 journal_and_reply(mdr, in, dn, le, fin);
4208
4209 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4210 // have overshot the split size (multiple opencs in flight), so here is
4211 // an early chance to split the dir if this openc makes it oversized.
4212 mds->balancer->maybe_fragment(dir, false);
4213 }
4214
4215
4216
4217 void Server::handle_client_readdir(MDRequestRef& mdr)
4218 {
4219 const MClientRequest::const_ref &req = mdr->client_request;
4220 client_t client = req->get_source().num();
4221 MutationImpl::LockOpVec lov;
4222 CInode *diri = rdlock_path_pin_ref(mdr, 0, lov, false, true);
4223 if (!diri) return;
4224
4225 // it's a directory, right?
4226 if (!diri->is_dir()) {
4227 // not a dir
4228 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
4229 respond_to_request(mdr, -ENOTDIR);
4230 return;
4231 }
4232
4233 lov.add_rdlock(&diri->filelock);
4234 lov.add_rdlock(&diri->dirfragtreelock);
4235
4236 if (!mds->locker->acquire_locks(mdr, lov))
4237 return;
4238
4239 if (!check_access(mdr, diri, MAY_READ))
4240 return;
4241
4242 // which frag?
4243 frag_t fg = (__u32)req->head.args.readdir.frag;
4244 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4245 string offset_str = req->get_path2();
4246
4247 __u32 offset_hash = 0;
4248 if (!offset_str.empty())
4249 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4250 else
4251 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4252
4253 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4254 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4255
4256 // does the frag exist?
4257 if (diri->dirfragtree[fg.value()] != fg) {
4258 frag_t newfg;
4259 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4260 if (fg.contains((unsigned)offset_hash)) {
4261 newfg = diri->dirfragtree[offset_hash];
4262 } else {
4263 // client actually wants next frag
4264 newfg = diri->dirfragtree[fg.value()];
4265 }
4266 } else {
4267 offset_str.clear();
4268 newfg = diri->dirfragtree[fg.value()];
4269 }
4270 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4271 fg = newfg;
4272 }
4273
4274 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4275 if (!dir) return;
4276
4277 // ok!
4278 dout(10) << "handle_client_readdir on " << *dir << dendl;
4279 ceph_assert(dir->is_auth());
4280
4281 if (!dir->is_complete()) {
4282 if (dir->is_frozen()) {
4283 dout(7) << "dir is frozen " << *dir << dendl;
4284 mds->locker->drop_locks(mdr.get());
4285 mdr->drop_local_auth_pins();
4286 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4287 return;
4288 }
4289 // fetch
4290 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4291 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4292 return;
4293 }
4294
4295 #ifdef MDS_VERIFY_FRAGSTAT
4296 dir->verify_fragstat();
4297 #endif
4298
4299 utime_t now = ceph_clock_now();
4300 mdr->set_mds_stamp(now);
4301
4302 snapid_t snapid = mdr->snapid;
4303 dout(10) << "snapid " << snapid << dendl;
4304
4305 SnapRealm *realm = diri->find_snaprealm();
4306
4307 unsigned max = req->head.args.readdir.max_entries;
4308 if (!max)
4309 max = dir->get_num_any(); // whatever, something big.
4310 unsigned max_bytes = req->head.args.readdir.max_bytes;
4311 if (!max_bytes)
4312 // make sure at least one item can be encoded
4313 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4314
4315 // start final blob
4316 bufferlist dirbl;
4317 DirStat ds;
4318 ds.frag = dir->get_frag();
4319 ds.auth = dir->get_dir_auth().first;
4320 if (dir->is_auth())
4321 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4322
4323 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4324
4325 // count bytes available.
4326 // this isn't perfect, but we should capture the main variable/unbounded size items!
4327 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4328 int bytes_left = max_bytes - front_bytes;
4329 bytes_left -= realm->get_snap_trace().length();
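// Resulting reply blob layout (restating the encodes at the end of this
// function): [DirStat][__u32 numfiles][__u16 flags][per-entry name + lease
// + inodestat]; the __u32 + 2*__u8 above accounts for numfiles and flags.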
4330
4331 // build dir contents
4332 bufferlist dnbl;
4333 __u32 numfiles = 0;
4334 bool start = !offset_hash && offset_str.empty();
4335 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4336 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4337 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4338 bool end = (it == dir->end());
4339 for (; !end && numfiles < max; end = (it == dir->end())) {
4340 CDentry *dn = it->second;
4341 ++it;
4342
4343 if (dn->state_test(CDentry::STATE_PURGING))
4344 continue;
4345
4346 bool dnp = dn->use_projected(client, mdr);
4347 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4348
4349 if (dnl->is_null())
4350 continue;
4351
4352 if (dn->last < snapid || dn->first > snapid) {
4353 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4354 continue;
4355 }
4356
4357 if (!start) {
4358 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4359 if (!(offset_key < dn->key()))
4360 continue;
4361 }
4362
4363 CInode *in = dnl->get_inode();
4364
4365 if (in && in->ino() == CEPH_INO_CEPH)
4366 continue;
4367
4368 // remote link?
4369 // better for the MDS to do the work, if we think the client will stat any of these files.
4370 if (dnl->is_remote() && !in) {
4371 in = mdcache->get_inode(dnl->get_remote_ino());
4372 if (in) {
4373 dn->link_remote(dnl, in);
4374 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4375 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4376 continue;
4377 } else {
4378 // touch everything i _do_ have
4379 for (auto &p : *dir) {
4380 if (!p.second->get_linkage()->is_null())
4381 mdcache->lru.lru_touch(p.second);
4382 }
4383
4384 // already issued caps and leases, reply immediately.
4385 if (dnbl.length() > 0) {
4386 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4387 dout(10) << " open remote dentry after caps were issued, stopping at "
4388 << dnbl.length() << " < " << bytes_left << dendl;
4389 break;
4390 }
4391
4392 mds->locker->drop_locks(mdr.get());
4393 mdr->drop_local_auth_pins();
4394 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4395 return;
4396 }
4397 }
4398 ceph_assert(in);
4399
4400 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4401 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4402 break;
4403 }
4404
4405 unsigned start_len = dnbl.length();
4406
4407 // dentry
4408 dout(12) << "including dn " << *dn << dendl;
4409 encode(dn->get_name(), dnbl);
4410 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
4411
4412 // inode
4413 dout(12) << "including inode " << *in << dendl;
4414 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4415 if (r < 0) {
4416 // chop off dn->name, lease
4417 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4418 bufferlist keep;
4419 keep.substr_of(dnbl, 0, start_len);
4420 dnbl.swap(keep);
4421 break;
4422 }
4423 ceph_assert(r >= 0);
4424 numfiles++;
4425
4426 // touch dn
4427 mdcache->lru.lru_touch(dn);
4428 }
4429
4430 __u16 flags = 0;
4431 if (end) {
4432 flags = CEPH_READDIR_FRAG_END;
4433 if (start)
4434 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4435 }
4436 // clients that lack REPLY_BITFLAGS only understand the END and COMPLETE flags
4437 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4438 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4439 }
4440
4441 // finish final blob
4442 encode(numfiles, dirbl);
4443 encode(flags, dirbl);
4444 dirbl.claim_append(dnbl);
4445
4446 // yay, reply
4447 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4448 << " bytes=" << dirbl.length()
4449 << " start=" << (int)start
4450 << " end=" << (int)end
4451 << dendl;
4452 mdr->reply_extra_bl = dirbl;
4453
4454 // bump popularity. NOTE: this doesn't quite capture it.
4455 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
4456
4457 // reply
4458 mdr->tracei = diri;
4459 respond_to_request(mdr, 0);
4460 }
4461
4462
4463
4464 // ===============================================================================
4465 // INODE UPDATES
4466
4467
4468 /*
4469 * finisher for basic inode updates
4470 */
4471 class C_MDS_inode_update_finish : public ServerLogContext {
4472 CInode *in;
4473 bool truncating_smaller, changed_ranges, new_realm;
4474 public:
4475 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4476 bool sm=false, bool cr=false, bool nr=false) :
4477 ServerLogContext(s, r), in(i),
4478 truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
4479 void finish(int r) override {
4480 ceph_assert(r == 0);
4481
4482 // apply
4483 in->pop_and_dirty_projected_inode(mdr->ls);
4484 mdr->apply();
4485
4486 MDSRank *mds = get_mds();
4487
4488 // notify any clients
4489 if (truncating_smaller && in->inode.is_truncating()) {
4490 mds->locker->issue_truncate(in);
4491 mds->mdcache->truncate_inode(in, mdr->ls);
4492 }
4493
4494 if (new_realm) {
4495 int op = CEPH_SNAP_OP_SPLIT;
4496 mds->mdcache->send_snap_update(in, 0, op);
4497 mds->mdcache->do_realm_invalidate_and_update_notify(in, op);
4498 }
4499
4500 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4501
4502 server->respond_to_request(mdr, 0);
4503
4504 if (changed_ranges)
4505 get_mds()->locker->share_inode_max_size(in);
4506 }
4507 };
4508
4509 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4510 {
4511 const MClientRequest::const_ref &req = mdr->client_request;
4512 MutationImpl::LockOpVec lov;
4513
4514 // get the inode to operate on, and set up any locks needed for that
4515 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4516 if (!cur)
4517 return;
4518
4519 lov.add_xlock(&cur->flocklock);
4520 /* acquire_locks will return true if it gets the locks. If it fails,
4521 it will redeliver this request at a later date, so drop the request.
4522 */
4523 if (!mds->locker->acquire_locks(mdr, lov)) {
4524 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4525 return;
4526 }
4527
4528 // copy the lock change into a ceph_filelock so we can store/apply it
4529 ceph_filelock set_lock;
4530 set_lock.start = req->head.args.filelock_change.start;
4531 set_lock.length = req->head.args.filelock_change.length;
4532 set_lock.client = req->get_orig_source().num();
4533 set_lock.owner = req->head.args.filelock_change.owner;
4534 set_lock.pid = req->head.args.filelock_change.pid;
4535 set_lock.type = req->head.args.filelock_change.type;
4536 bool will_wait = req->head.args.filelock_change.wait;
4537
4538 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4539
4540 ceph_lock_state_t *lock_state = NULL;
4541 bool interrupt = false;
4542
4543 // get the appropriate lock state
4544 switch (req->head.args.filelock_change.rule) {
4545 case CEPH_LOCK_FLOCK_INTR:
4546 interrupt = true;
4547 // fall-thru
4548 case CEPH_LOCK_FLOCK:
4549 lock_state = cur->get_flock_lock_state();
4550 break;
4551
4552 case CEPH_LOCK_FCNTL_INTR:
4553 interrupt = true;
4554 // fall-thru
4555 case CEPH_LOCK_FCNTL:
4556 lock_state = cur->get_fcntl_lock_state();
4557 break;
4558
4559 default:
4560 dout(10) << "got unknown lock type " << set_lock.type
4561 << ", dropping request!" << dendl;
4562 respond_to_request(mdr, -EOPNOTSUPP);
4563 return;
4564 }
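// Outcome summary for the branches below (restated for readability):
//   unlock, lock was waiting -> dequeue it, wake WAIT_FLOCK waiters
//   unlock (not interrupt)   -> remove the lock, wake WAIT_FLOCK waiters
//   lock, wait was canceled  -> -EINTR
//   lock, add_lock() fails   -> -EDEADLK, -EWOULDBLOCK, or queue and retry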
4565
4566 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4567 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4568 list<ceph_filelock> activated_locks;
4569 MDSContext::vec waiters;
4570 if (lock_state->is_waiting(set_lock)) {
4571 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4572 lock_state->remove_waiting(set_lock);
4573 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4574 } else if (!interrupt) {
4575 dout(10) << " unlock attempt on " << set_lock << dendl;
4576 lock_state->remove_lock(set_lock, activated_locks);
4577 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4578 }
4579 mds->queue_waiters(waiters);
4580
4581 respond_to_request(mdr, 0);
4582 } else {
4583 dout(10) << " lock attempt on " << set_lock << dendl;
4584 bool deadlock = false;
4585 if (mdr->more()->flock_was_waiting &&
4586 !lock_state->is_waiting(set_lock)) {
4587 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4588 respond_to_request(mdr, -EINTR);
4589 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4590 dout(10) << " it failed on this attempt" << dendl;
4591 // couldn't set lock right now
4592 if (deadlock) {
4593 respond_to_request(mdr, -EDEADLK);
4594 } else if (!will_wait) {
4595 respond_to_request(mdr, -EWOULDBLOCK);
4596 } else {
4597 dout(10) << " added to waiting list" << dendl;
4598 ceph_assert(lock_state->is_waiting(set_lock));
4599 mdr->more()->flock_was_waiting = true;
4600 mds->locker->drop_locks(mdr.get());
4601 mdr->drop_local_auth_pins();
4602 mdr->mark_event("failed to add lock, waiting");
4603 mdr->mark_nowarn();
4604 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4605 }
4606 } else
4607 respond_to_request(mdr, 0);
4608 }
4609 dout(10) << " state after lock change: " << *lock_state << dendl;
4610 }
4611
4612 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4613 {
4614 const MClientRequest::const_ref &req = mdr->client_request;
4615 MutationImpl::LockOpVec lov;
4616
4617 // get the inode to operate on, and set up any locks needed for that
4618 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4619 if (!cur)
4620 return;
4621
4622 /* acquire_locks will return true if it gets the locks. If it fails,
4623 it will redeliver this request at a later date, so drop the request.
4624 */
4625 lov.add_rdlock(&cur->flocklock);
4626 if (!mds->locker->acquire_locks(mdr, lov)) {
4627 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4628 return;
4629 }
4630
4631 // copy the lock change into a ceph_filelock so we can store/apply it
4632 ceph_filelock checking_lock;
4633 checking_lock.start = req->head.args.filelock_change.start;
4634 checking_lock.length = req->head.args.filelock_change.length;
4635 checking_lock.client = req->get_orig_source().num();
4636 checking_lock.owner = req->head.args.filelock_change.owner;
4637 checking_lock.pid = req->head.args.filelock_change.pid;
4638 checking_lock.type = req->head.args.filelock_change.type;
4639
4640 // get the appropriate lock state
4641 ceph_lock_state_t *lock_state = NULL;
4642 switch (req->head.args.filelock_change.rule) {
4643 case CEPH_LOCK_FLOCK:
4644 lock_state = cur->get_flock_lock_state();
4645 break;
4646
4647 case CEPH_LOCK_FCNTL:
4648 lock_state = cur->get_fcntl_lock_state();
4649 break;
4650
4651 default:
4652 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4653 respond_to_request(mdr, -EINVAL);
4654 return;
4655 }
4656 lock_state->look_for_lock(checking_lock);
4657
4658 bufferlist lock_bl;
4659 encode(checking_lock, lock_bl);
4660
4661 mdr->reply_extra_bl = lock_bl;
4662 respond_to_request(mdr, 0);
4663 }
4664
4665 void Server::handle_client_setattr(MDRequestRef& mdr)
4666 {
4667 const MClientRequest::const_ref &req = mdr->client_request;
4668 MutationImpl::LockOpVec lov;
4669 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4670 if (!cur) return;
4671
4672 if (mdr->snapid != CEPH_NOSNAP) {
4673 respond_to_request(mdr, -EROFS);
4674 return;
4675 }
4676 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4677 respond_to_request(mdr, -EPERM);
4678 return;
4679 }
4680
4681 __u32 mask = req->head.args.setattr.mask;
4682 __u32 access_mask = MAY_WRITE;
4683
4684 // xlock inode
4685 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4686 lov.add_xlock(&cur->authlock);
4687 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4688 lov.add_xlock(&cur->filelock);
4689 if (mask & CEPH_SETATTR_CTIME)
4690 lov.add_wrlock(&cur->versionlock);
4691
4692 if (!mds->locker->acquire_locks(mdr, lov))
4693 return;
4694
4695 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4696 access_mask |= MAY_CHOWN;
4697
4698 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4699 access_mask |= MAY_CHGRP;
4700
4701 if (!check_access(mdr, cur, access_mask))
4702 return;
4703
4704 // trunc from bigger -> smaller?
4705 auto pip = cur->get_projected_inode();
4706
4707 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
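// (illustrative note: the client may have grown the file under its write
// caps, so take the larger of our projected size and the client's old_size)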
4708
4709 // ENOSPC on growing file while full, but allow shrinks
4710 if (is_full && req->head.args.setattr.size > old_size) {
4711 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4712 respond_to_request(mdr, -ENOSPC);
4713 return;
4714 }
4715
4716 bool truncating_smaller = false;
4717 if (mask & CEPH_SETATTR_SIZE) {
4718 truncating_smaller = req->head.args.setattr.size < old_size;
4719 if (truncating_smaller && pip->is_truncating()) {
4720 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4721 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4722 mds->locker->drop_locks(mdr.get());
4723 mdr->drop_local_auth_pins();
4724 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4725 return;
4726 }
4727 }
4728
4729 bool changed_ranges = false;
4730
4731 // project update
4732 mdr->ls = mdlog->get_current_segment();
4733 EUpdate *le = new EUpdate(mdlog, "setattr");
4734 mdlog->start_entry(le);
4735
4736 auto &pi = cur->project_inode();
4737
4738 if (mask & CEPH_SETATTR_UID)
4739 pi.inode.uid = req->head.args.setattr.uid;
4740 if (mask & CEPH_SETATTR_GID)
4741 pi.inode.gid = req->head.args.setattr.gid;
4742
4743 if (mask & CEPH_SETATTR_MODE)
4744 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4745 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4746 S_ISREG(pi.inode.mode) &&
4747 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4748 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4749 }
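// e.g. (illustrative) a chown on a mode-04755 binary takes this branch and
// leaves 0755: the setuid/setgid bits are stripped, as on local filesystems.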
4750
4751 if (mask & CEPH_SETATTR_MTIME)
4752 pi.inode.mtime = req->head.args.setattr.mtime;
4753 if (mask & CEPH_SETATTR_ATIME)
4754 pi.inode.atime = req->head.args.setattr.atime;
4755 if (mask & CEPH_SETATTR_BTIME)
4756 pi.inode.btime = req->head.args.setattr.btime;
4757 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4758 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4759 if (mask & CEPH_SETATTR_SIZE) {
4760 if (truncating_smaller) {
4761 pi.inode.truncate(old_size, req->head.args.setattr.size);
4762 le->metablob.add_truncate_start(cur->ino());
4763 } else {
4764 pi.inode.size = req->head.args.setattr.size;
4765 pi.inode.rstat.rbytes = pi.inode.size;
4766 }
4767 pi.inode.mtime = mdr->get_op_stamp();
4768
4769 // adjust client's max_size?
4770 CInode::mempool_inode::client_range_map new_ranges;
4771 bool max_increased = false;
4772 mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
4773 if (pi.inode.client_ranges != new_ranges) {
4774 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
4775 pi.inode.client_ranges = new_ranges;
4776 changed_ranges = true;
4777 }
4778 }
4779
4780 pi.inode.version = cur->pre_dirty();
4781 pi.inode.ctime = mdr->get_op_stamp();
4782 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4783 pi.inode.rstat.rctime = mdr->get_op_stamp();
4784 pi.inode.change_attr++;
4785
4786 // log + wait
4787 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4788 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4789 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4790
4791 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4792 truncating_smaller, changed_ranges));
4793
4794 // flush immediately if there are readers/writers waiting
4795 if (mdr->is_xlocked(&cur->filelock) &&
4796 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4797 mds->mdlog->flush();
4798 }
4799
4800 /* Takes responsibility for mdr */
4801 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4802 {
4803 CInode *in = mdr->in[0];
4804 client_t client = mdr->get_client();
4805 ceph_assert(in);
4806
4807 dout(10) << "do_open_truncate " << *in << dendl;
4808
4809 SnapRealm *realm = in->find_snaprealm();
4810 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4811
4812 mdr->ls = mdlog->get_current_segment();
4813 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4814 mdlog->start_entry(le);
4815
4816 // prepare
4817 auto &pi = in->project_inode();
4818 pi.inode.version = in->pre_dirty();
4819 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
4820 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4821 pi.inode.rstat.rctime = mdr->get_op_stamp();
4822 pi.inode.change_attr++;
4823
4824 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
4825 if (old_size > 0) {
4826 pi.inode.truncate(old_size, 0);
4827 le->metablob.add_truncate_start(in->ino());
4828 }
4829
4830 bool changed_ranges = false;
4831 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4832 pi.inode.client_ranges[client].range.first = 0;
4833 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
4834 pi.inode.client_ranges[client].follows = realm->get_newest_seq();
4835 changed_ranges = true;
4836 cap->mark_clientwriteable();
4837 }
4838
4839 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4840
4841 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4842 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4843
4844 // make sure ino gets into the journal
4845 le->metablob.add_opened_ino(in->ino());
4846
4847 mdr->o_trunc = true;
4848
4849 CDentry *dn = 0;
4850 if (mdr->client_request->get_dentry_wanted()) {
4851 ceph_assert(mdr->dn[0].size());
4852 dn = mdr->dn[0].back();
4853 }
4854
4855 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4856 changed_ranges));
4857 // Although the `open` part can give an early reply, the truncation won't
4858 // happen until our EUpdate is persistent; to give the client a prompt
4859 // response we must also flush that event.
4860 mdlog->flush();
4861 }
4862
4863
4864 /* This function cleans up the passed mdr */
4865 void Server::handle_client_setlayout(MDRequestRef& mdr)
4866 {
4867 const MClientRequest::const_ref &req = mdr->client_request;
4868 MutationImpl::LockOpVec lov;
4869 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4870 if (!cur) return;
4871
4872 if (mdr->snapid != CEPH_NOSNAP) {
4873 respond_to_request(mdr, -EROFS);
4874 return;
4875 }
4876 if (!cur->is_file()) {
4877 respond_to_request(mdr, -EINVAL);
4878 return;
4879 }
4880 if (cur->get_projected_inode()->size ||
4881 cur->get_projected_inode()->truncate_seq > 1) {
4882 respond_to_request(mdr, -ENOTEMPTY);
4883 return;
4884 }
4885
4886 // validate layout
4887 file_layout_t layout = cur->get_projected_inode()->layout;
4888 // save existing layout for later
4889 const auto old_layout = layout;
4890
4891 int access = MAY_WRITE;
4892
4893 if (req->head.args.setlayout.layout.fl_object_size > 0)
4894 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4895 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4896 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4897 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4898 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4899 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4900 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4901
4902 // make sure we have as new a map as the client
4903 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4904 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4905 return;
4906 }
4907 }
4908
4909 // Don't permit layout modifications without 'p' caps
4910 if (layout != old_layout) {
4911 access |= MAY_SET_VXATTR;
4912 }
4913
4914 if (!layout.is_valid()) {
4915 dout(10) << "bad layout" << dendl;
4916 respond_to_request(mdr, -EINVAL);
4917 return;
4918 }
4919 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4920 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4921 respond_to_request(mdr, -EINVAL);
4922 return;
4923 }
4924
4925 lov.add_xlock(&cur->filelock);
4926 if (!mds->locker->acquire_locks(mdr, lov))
4927 return;
4928
4929 if (!check_access(mdr, cur, access))
4930 return;
4931
4932 // project update
4933 auto &pi = cur->project_inode();
4934 pi.inode.layout = layout;
4935 // add the old pool to the inode
4936 pi.inode.add_old_pool(old_layout.pool_id);
4937 pi.inode.version = cur->pre_dirty();
4938 pi.inode.ctime = mdr->get_op_stamp();
4939 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4940 pi.inode.rstat.rctime = mdr->get_op_stamp();
4941 pi.inode.change_attr++;
4942
4943 // log + wait
4944 mdr->ls = mdlog->get_current_segment();
4945 EUpdate *le = new EUpdate(mdlog, "setlayout");
4946 mdlog->start_entry(le);
4947 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4948 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4949 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4950
4951 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4952 }
4953
4954 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4955 {
4956 const MClientRequest::const_ref &req = mdr->client_request;
4957 MutationImpl::LockOpVec lov;
4958 file_layout_t *dir_layout = nullptr;
4959 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
4960 if (!cur) return;
4961
4962 if (mdr->snapid != CEPH_NOSNAP) {
4963 respond_to_request(mdr, -EROFS);
4964 return;
4965 }
4966
4967 if (!cur->is_dir()) {
4968 respond_to_request(mdr, -ENOTDIR);
4969 return;
4970 }
4971
4972 lov.add_xlock(&cur->policylock);
4973 if (!mds->locker->acquire_locks(mdr, lov))
4974 return;
4975
4976 // validate layout
4977 const auto old_pi = cur->get_projected_inode();
4978 file_layout_t layout;
4979 if (old_pi->has_layout())
4980 layout = old_pi->layout;
4981 else if (dir_layout)
4982 layout = *dir_layout;
4983 else
4984 layout = mdcache->default_file_layout;
4985
4986 // Level of access required to complete
4987 int access = MAY_WRITE;
4988
4989 const auto old_layout = layout;
4990
4991 if (req->head.args.setlayout.layout.fl_object_size > 0)
4992 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4993 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4994 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4995 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4996 layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
4997 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4998 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4999 // make sure we have as new a map as the client
5000 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5001 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5002 return;
5003 }
5004 }
5005
5006 if (layout != old_layout) {
5007 access |= MAY_SET_VXATTR;
5008 }
5009
5010 if (!layout.is_valid()) {
5011 dout(10) << "bad layout" << dendl;
5012 respond_to_request(mdr, -EINVAL);
5013 return;
5014 }
5015 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5016 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5017 respond_to_request(mdr, -EINVAL);
5018 return;
5019 }
5020
5021 if (!check_access(mdr, cur, access))
5022 return;
5023
5024 auto &pi = cur->project_inode();
5025 pi.inode.layout = layout;
5026 pi.inode.version = cur->pre_dirty();
5027
5028 // log + wait
5029 mdr->ls = mdlog->get_current_segment();
5030 EUpdate *le = new EUpdate(mdlog, "setlayout");
5031 mdlog->start_entry(le);
5032 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5033 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5034 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5035
5036 mdr->no_early_reply = true;
5037 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5038 }
5039
5040 // XATTRS
5041
5042 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5043 file_layout_t *layout, bool validate)
5044 {
5045 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5046 try {
5047 if (name == "layout") {
5048 string::iterator begin = value.begin();
5049 string::iterator end = value.end();
5050 keys_and_values<string::iterator> p; // create instance of parser
5051 std::map<string, string> m; // map to receive results
5052 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5053 return -EINVAL;
5054 }
5055 string left(begin, end);
5056 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5057 if (begin != end)
5058 return -EINVAL;
5059 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5060 // Skip validation on each attr; we validate once at the end (to avoid
5061 // rejecting intermediate states if the overall result is ok)
5062 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5063 osdmap, layout, false);
5064 if (r < 0)
5065 return r;
5066 }
5067 } else if (name == "layout.object_size") {
5068 layout->object_size = boost::lexical_cast<unsigned>(value);
5069 } else if (name == "layout.stripe_unit") {
5070 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5071 } else if (name == "layout.stripe_count") {
5072 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5073 } else if (name == "layout.pool") {
5074 try {
5075 layout->pool_id = boost::lexical_cast<unsigned>(value);
5076 } catch (boost::bad_lexical_cast const&) {
5077 int64_t pool = osdmap.lookup_pg_pool_name(value);
5078 if (pool < 0) {
5079 dout(10) << " unknown pool " << value << dendl;
5080 return -ENOENT;
5081 }
5082 layout->pool_id = pool;
5083 }
5084 } else if (name == "layout.pool_namespace") {
5085 layout->pool_ns = value;
5086 } else {
5087 dout(10) << " unknown layout vxattr " << name << dendl;
5088 return -EINVAL;
5089 }
5090 } catch (boost::bad_lexical_cast const&) {
5091 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5092 return -EINVAL;
5093 }
5094
5095 if (validate && !layout->is_valid()) {
5096 dout(10) << "bad layout" << dendl;
5097 return -EINVAL;
5098 }
5099 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5100 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5101 return -EINVAL;
5102 }
5103 return 0;
5104 }
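// Illustrative sketch (not part of the original source): the composite form
// is a whitespace-separated key=value list that the keys_and_values grammar
// splits and feeds back through this function one field at a time, e.g.
//
//   file_layout_t layout = mdcache->default_file_layout;
//   int r = parse_layout_vxattr("layout",
//                               "stripe_unit=1048576 stripe_count=2 "
//                               "object_size=4194304 pool=cephfs_data",
//                               osdmap, &layout); // validate presumably
//                                                 // defaults to true
//
// The recursion passes validate=false so that a temporarily inconsistent
// intermediate state (e.g. stripe_unit updated before object_size) is not
// rejected; only the final combined layout must satisfy is_valid().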
5105
5106 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5107 {
5108 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5109 try {
5110 if (name == "quota") {
5111 string::iterator begin = value.begin();
5112 string::iterator end = value.end();
5113 if (begin == end) {
5114 // keep quota unchanged. (for create_quota_realm())
5115 return 0;
5116 }
5117 keys_and_values<string::iterator> p; // create instance of parser
5118 std::map<string, string> m; // map to receive results
5119 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5120 return -EINVAL;
5121 }
5122 string left(begin, end);
5123 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5124 if (begin != end)
5125 return -EINVAL;
5126 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5127 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5128 if (r < 0)
5129 return r;
5130 }
5131 } else if (name == "quota.max_bytes") {
5132 int64_t q = boost::lexical_cast<int64_t>(value);
5133 if (q < 0)
5134 return -EINVAL;
5135 quota->max_bytes = q;
5136 } else if (name == "quota.max_files") {
5137 int64_t q = boost::lexical_cast<int64_t>(value);
5138 if (q < 0)
5139 return -EINVAL;
5140 quota->max_files = q;
5141 } else {
5142 dout(10) << " unknown quota vxattr " << name << dendl;
5143 return -EINVAL;
5144 }
5145 } catch (boost::bad_lexical_cast const&) {
5146 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5147 return -EINVAL;
5148 }
5149
5150 if (!quota->is_valid()) {
5151 dout(10) << "bad quota" << dendl;
5152 return -EINVAL;
5153 }
5154 return 0;
5155 }
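// Illustrative sketch (not part of the original source): the composite
// "quota" vxattr uses the same key=value syntax, e.g.
//
//   quota_info_t quota = cur->get_projected_inode()->quota;
//   int r = parse_quota_vxattr("quota",
//                              "max_bytes=107374182400 max_files=10000",
//                              &quota);
//
// An empty value deliberately returns 0 with *quota untouched; that is the
// path taken by the empty-payload setxattr sent from create_quota_realm()
// below.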
5156
5157 void Server::create_quota_realm(CInode *in)
5158 {
5159 dout(10) << __func__ << " " << *in << dendl;
5160
5161 auto req = MClientRequest::create(CEPH_MDS_OP_SETXATTR);
5162 req->set_filepath(filepath(in->ino()));
5163 req->set_string2("ceph.quota");
5164 // empty vxattr value
5165 req->set_tid(mds->issue_tid());
5166
5167 mds->send_message_mds(req, in->authority().first);
5168 }
5169
5170 /*
5171 * Verify that the file layout attribute carried by the client
5172 * is well-formed.
5173 * Returns 0 on success; otherwise this function takes
5174 * responsibility for the passed mdr.
5175 */
5176 int Server::check_layout_vxattr(MDRequestRef& mdr,
5177 string name,
5178 string value,
5179 file_layout_t *layout)
5180 {
5181 const MClientRequest::const_ref &req = mdr->client_request;
5182 epoch_t epoch;
5183 int r;
5184
5185 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5186 r = parse_layout_vxattr(name, value, osdmap, layout);
5187 epoch = osdmap.get_epoch();
5188 });
5189
5190 if (r == -ENOENT) {
5191
5192 // we don't have the specified pool; make sure our map
5193 // is at least as new as the client's.
5194 epoch_t req_epoch = req->get_osdmap_epoch();
5195
5196 if (req_epoch > epoch) {
5197
5198 // well, our map is older. wait for a newer osdmap.
5199 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5200
5201 if (!mds->objecter->wait_for_map(req_epoch, fin))
5202 return r; // wait, fin will retry this request later
5203
5204 delete fin;
5205
5206 // now we have at least as new a map as the client, try again.
5207 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5208 r = parse_layout_vxattr(name, value, osdmap, layout);
5209 epoch = osdmap.get_epoch();
5210 });
5211
5212 ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
5213
5214 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5215
5216 // For compatibility with clients running old code, we still need to get
5217 // the latest map. Once the COMPACT_VERSION of MClientRequest is >= 3,
5218 // we can remove this code.
5219 mdr->waited_for_osdmap = true;
5220 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
5221 mds, new C_MDS_RetryRequest(mdcache, mdr)));
5222 return r;
5223 }
5224 }
5225
5226 if (r < 0) {
5227
5228 if (r == -ENOENT)
5229 r = -EINVAL;
5230
5231 respond_to_request(mdr, r);
5232 return r;
5233 }
5234
5235 // all is well
5236 return 0;
5237 }
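// Illustrative sketch (not part of the original source) of the calling
// convention: a negative return means mdr has already been dealt with
// (an error reply was sent, or a retry was queued behind a newer osdmap),
// so callers must simply bail out:
//
//   file_layout_t layout = cur->get_projected_inode()->layout;
//   if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
//     return; // do not touch mdr again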
5238
5239 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
5240 file_layout_t *dir_layout,
5241 MutationImpl::LockOpVec& lov)
5242 {
5243 const MClientRequest::const_ref &req = mdr->client_request;
5244 string name(req->get_path2());
5245 bufferlist bl = req->get_data();
5246 string value (bl.c_str(), bl.length());
5247 dout(10) << "handle_set_vxattr " << name
5248 << " val " << value.length()
5249 << " bytes on " << *cur
5250 << dendl;
5251
5252 CInode::mempool_inode *pip = nullptr;
5253 string rest;
5254
5255 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5256 return;
5257 }
5258
5259 bool new_realm = false;
5260 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5261 if (!cur->is_dir()) {
5262 respond_to_request(mdr, -EINVAL);
5263 return;
5264 }
5265
5266 file_layout_t layout;
5267 if (cur->get_projected_inode()->has_layout())
5268 layout = cur->get_projected_inode()->layout;
5269 else if (dir_layout)
5270 layout = *dir_layout;
5271 else
5272 layout = mdcache->default_file_layout;
5273
5274 rest = name.substr(name.find("layout"));
5275 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5276 return;
5277
5278 lov.add_xlock(&cur->policylock);
5279 if (!mds->locker->acquire_locks(mdr, lov))
5280 return;
5281
5282 auto &pi = cur->project_inode();
5283 pi.inode.layout = layout;
5284 mdr->no_early_reply = true;
5285 pip = &pi.inode;
5286 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5287 if (!cur->is_file()) {
5288 respond_to_request(mdr, -EINVAL);
5289 return;
5290 }
5291 if (cur->get_projected_inode()->size ||
5292 cur->get_projected_inode()->truncate_seq > 1) {
5293 respond_to_request(mdr, -ENOTEMPTY);
5294 return;
5295 }
5296 file_layout_t layout = cur->get_projected_inode()->layout;
5297 rest = name.substr(name.find("layout"));
5298 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5299 return;
5300
5301 lov.add_xlock(&cur->filelock);
5302 if (!mds->locker->acquire_locks(mdr, lov))
5303 return;
5304
5305 auto &pi = cur->project_inode();
5306 int64_t old_pool = pi.inode.layout.pool_id;
5307 pi.inode.add_old_pool(old_pool);
5308 pi.inode.layout = layout;
5309 pip = &pi.inode;
5310 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5311 if (!cur->is_dir() || cur->is_root()) {
5312 respond_to_request(mdr, -EINVAL);
5313 return;
5314 }
5315
5316 quota_info_t quota = cur->get_projected_inode()->quota;
5317
5318 rest = name.substr(name.find("quota"));
5319 int r = parse_quota_vxattr(rest, value, &quota);
5320 if (r < 0) {
5321 respond_to_request(mdr, r);
5322 return;
5323 }
5324
5325 lov.add_xlock(&cur->policylock);
5326 if (quota.is_enable() && !cur->get_projected_srnode()) {
5327 lov.add_xlock(&cur->snaplock);
5328 new_realm = true;
5329 }
5330
5331 if (!mds->locker->acquire_locks(mdr, lov))
5332 return;
5333
5334 auto &pi = cur->project_inode(false, new_realm);
5335 pi.inode.quota = quota;
5336
5337 if (new_realm) {
5338 SnapRealm *realm = cur->find_snaprealm();
5339 auto seq = realm->get_newest_seq();
5340 auto &newsnap = *pi.snapnode;
5341 newsnap.created = seq;
5342 newsnap.seq = seq;
5343 }
5344 mdr->no_early_reply = true;
5345 pip = &pi.inode;
5346
5347 client_t exclude_ct = mdr->get_client();
5348 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5349 } else if (name.find("ceph.dir.pin") == 0) {
5350 if (!cur->is_dir() || cur->is_root()) {
5351 respond_to_request(mdr, -EINVAL);
5352 return;
5353 }
5354
5355 mds_rank_t rank;
5356 try {
5357 rank = boost::lexical_cast<mds_rank_t>(value);
5358 if (rank < 0) rank = MDS_RANK_NONE;
5359 } catch (boost::bad_lexical_cast const&) {
5360 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5361 respond_to_request(mdr, -EINVAL);
5362 return;
5363 }
5364
5365 lov.add_xlock(&cur->policylock);
5366 if (!mds->locker->acquire_locks(mdr, lov))
5367 return;
5368
5369 auto &pi = cur->project_inode();
5370 cur->set_export_pin(rank);
5371 pip = &pi.inode;
5372 } else {
5373 dout(10) << " unknown vxattr " << name << dendl;
5374 respond_to_request(mdr, -EINVAL);
5375 return;
5376 }
5377
5378 pip->change_attr++;
5379 pip->ctime = mdr->get_op_stamp();
5380 if (mdr->get_op_stamp() > pip->rstat.rctime)
5381 pip->rstat.rctime = mdr->get_op_stamp();
5382 pip->version = cur->pre_dirty();
5383 if (cur->is_file())
5384 pip->update_backtrace();
5385
5386 // log + wait
5387 mdr->ls = mdlog->get_current_segment();
5388 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5389 mdlog->start_entry(le);
5390 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5391 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5392 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5393
5394 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5395 false, false, new_realm));
5396 return;
5397 }
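// Illustrative examples (not part of the original source) of vxattrs the
// handler above accepts, as issued from a client (names and values are
// placeholders):
//
//   setfattr -n ceph.dir.layout.stripe_count -v 4 mydir
//   setfattr -n ceph.file.layout -v "stripe_unit=1048576 pool=cephfs_data" f
//   setfattr -n ceph.quota.max_files -v 10000 mydir  // dirs only, not root
//   setfattr -n ceph.dir.pin -v 1 mydir              // export pin to rank 1
//
// Note that ceph.file.layout is rejected with -ENOTEMPTY once the file has
// data (size > 0 or truncate_seq > 1), since already-written objects would
// still use the old layout.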
5398
5399 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
5400 file_layout_t *dir_layout,
5401 MutationImpl::LockOpVec& lov)
5402 {
5403 const MClientRequest::const_ref &req = mdr->client_request;
5404 string name(req->get_path2());
5405
5406 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5407
5408 if (name == "ceph.dir.layout") {
5409 if (!cur->is_dir()) {
5410 respond_to_request(mdr, -ENODATA);
5411 return;
5412 }
5413 if (cur->is_root()) {
5414 dout(10) << "can't remove layout policy on the root directory" << dendl;
5415 respond_to_request(mdr, -EINVAL);
5416 return;
5417 }
5418
5419 if (!cur->get_projected_inode()->has_layout()) {
5420 respond_to_request(mdr, -ENODATA);
5421 return;
5422 }
5423
5424 lov.add_xlock(&cur->policylock);
5425 if (!mds->locker->acquire_locks(mdr, lov))
5426 return;
5427
5428 auto &pi = cur->project_inode();
5429 pi.inode.clear_layout();
5430 pi.inode.version = cur->pre_dirty();
5431
5432 // log + wait
5433 mdr->ls = mdlog->get_current_segment();
5434 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5435 mdlog->start_entry(le);
5436 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5437 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5438 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5439
5440 mdr->no_early_reply = true;
5441 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5442 return;
5443 } else if (name == "ceph.dir.layout.pool_namespace"
5444 || name == "ceph.file.layout.pool_namespace") {
5445 // Namespace is the only layout field that has a meaningful
5446 // null/none value (an empty string means the default layout). Removing
5447 // it is equivalent to a setxattr with an empty string: pass through the
5448 // empty payload of the rmxattr request to do this.
5449 handle_set_vxattr(mdr, cur, dir_layout, lov);
5450 return;
5451 }
5452
5453 respond_to_request(mdr, -ENODATA);
5454 }
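// Illustrative example (not part of the original source): because an empty
// pool_namespace means "use the default", removal is routed through the
// setxattr path above, making these two client commands equivalent:
//
//   setfattr -x ceph.file.layout.pool_namespace f
//   setfattr -n ceph.file.layout.pool_namespace -v "" f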
5455
5456 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
5457 CInode *in;
5458 public:
5459
5460 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
5461 ServerLogContext(s, r), in(i) { }
5462 void finish(int r) override {
5463 ceph_assert(r == 0);
5464
5465 // apply
5466 in->pop_and_dirty_projected_inode(mdr->ls);
5467
5468 mdr->apply();
5469
5470 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5471
5472 server->respond_to_request(mdr, 0);
5473 }
5474 };
5475
5476 void Server::handle_client_setxattr(MDRequestRef& mdr)
5477 {
5478 const MClientRequest::const_ref &req = mdr->client_request;
5479 string name(req->get_path2());
5480 MutationImpl::LockOpVec lov;
5481 CInode *cur;
5482
5483 file_layout_t *dir_layout = NULL;
5484 if (name.compare(0, 15, "ceph.dir.layout") == 0)
5485 cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
5486 else
5487 cur = rdlock_path_pin_ref(mdr, 0, lov, true);
5488 if (!cur)
5489 return;
5490
5491 if (mdr->snapid != CEPH_NOSNAP) {
5492 respond_to_request(mdr, -EROFS);
5493 return;
5494 }
5495
5496 int flags = req->head.args.setxattr.flags;
5497
5498 // magic ceph.* namespace?
5499 if (name.compare(0, 5, "ceph.") == 0) {
5500 handle_set_vxattr(mdr, cur, dir_layout, lov);
5501 return;
5502 }
5503
5504 lov.add_xlock(&cur->xattrlock);
5505 if (!mds->locker->acquire_locks(mdr, lov))
5506 return;
5507
5508 if (!check_access(mdr, cur, MAY_WRITE))
5509 return;
5510
5511 auto pxattrs = cur->get_projected_xattrs();
5512 size_t len = req->get_data().length();
5513 size_t inc = len + name.length();
5514
5515 // check xattrs kv pairs size
5516 size_t cur_xattrs_size = 0;
5517 for (const auto& p : *pxattrs) {
5518 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
5519 continue;
5520 }
5521 cur_xattrs_size += p.first.length() + p.second.length();
5522 }
5523
5524 if (cur_xattrs_size + inc > g_conf()->mds_max_xattr_pairs_size) {
5525 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5526 << cur_xattrs_size << ", inc " << inc << dendl;
5527 respond_to_request(mdr, -ENOSPC);
5528 return;
5529 }
5530
5531 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
5532 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
5533 respond_to_request(mdr, -EEXIST);
5534 return;
5535 }
5536 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
5537 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
5538 respond_to_request(mdr, -ENODATA);
5539 return;
5540 }
5541
5542 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
5543
5544 // project update
5545 auto &pi = cur->project_inode(true);
5546 pi.inode.version = cur->pre_dirty();
5547 pi.inode.ctime = mdr->get_op_stamp();
5548 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5549 pi.inode.rstat.rctime = mdr->get_op_stamp();
5550 pi.inode.change_attr++;
5551 pi.inode.xattr_version++;
5552 auto &px = *pi.xattrs;
5553 if ((flags & CEPH_XATTR_REMOVE)) {
5554 px.erase(mempool::mds_co::string(name));
5555 } else {
5556 bufferptr b = buffer::create(len);
5557 if (len)
5558 req->get_data().copy(0, len, b.c_str());
5559 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
5560 if (!em.second)
5561 em.first->second = b;
5562 }
5563
5564 // log + wait
5565 mdr->ls = mdlog->get_current_segment();
5566 EUpdate *le = new EUpdate(mdlog, "setxattr");
5567 mdlog->start_entry(le);
5568 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5569 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5570 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5571
5572 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5573 }
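// Worked example (not part of the original source) of the size check above,
// assuming mds_max_xattr_pairs_size is at its usual 64K default: with
// existing pairs totalling 60K, a CEPH_XATTR_REPLACE of a key whose old
// key+value is 10K leaves cur_xattrs_size at 50K, so an 8K replacement
// passes (58K) while a 20K replacement fails (70K) with -ENOSPC.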
5574
5575 void Server::handle_client_removexattr(MDRequestRef& mdr)
5576 {
5577 const MClientRequest::const_ref &req = mdr->client_request;
5578 std::string name(req->get_path2());
5579
5580 MutationImpl::LockOpVec lov;
5581 file_layout_t *dir_layout = nullptr;
5582 CInode *cur;
5583 if (name == "ceph.dir.layout")
5584 cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
5585 else
5586 cur = rdlock_path_pin_ref(mdr, 0, lov, true);
5587 if (!cur)
5588 return;
5589
5590 if (mdr->snapid != CEPH_NOSNAP) {
5591 respond_to_request(mdr, -EROFS);
5592 return;
5593 }
5594
5595 if (name.compare(0, 5, "ceph.") == 0) {
5596 handle_remove_vxattr(mdr, cur, dir_layout, lov);
5597 return;
5598 }
5599
5600 lov.add_xlock(&cur->xattrlock);
5601 if (!mds->locker->acquire_locks(mdr, lov))
5602 return;
5603
5604 auto pxattrs = cur->get_projected_xattrs();
5605 if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
5606 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5607 respond_to_request(mdr, -ENODATA);
5608 return;
5609 }
5610
5611 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5612
5613 // project update
5614 auto &pi = cur->project_inode(true);
5615 auto &px = *pi.xattrs;
5616 pi.inode.version = cur->pre_dirty();
5617 pi.inode.ctime = mdr->get_op_stamp();
5618 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5619 pi.inode.rstat.rctime = mdr->get_op_stamp();
5620 pi.inode.change_attr++;
5621 pi.inode.xattr_version++;
5622 px.erase(mempool::mds_co::string(name));
5623
5624 // log + wait
5625 mdr->ls = mdlog->get_current_segment();
5626 EUpdate *le = new EUpdate(mdlog, "removexattr");
5627 mdlog->start_entry(le);
5628 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5629 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5630 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5631
5632 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5633 }
5634
5635
5636 // =================================================================
5637 // DIRECTORY and NAMESPACE OPS
5638
5639
5640 // ------------------------------------------------
5641
5642 // MKNOD
5643
5644 class C_MDS_mknod_finish : public ServerLogContext {
5645 CDentry *dn;
5646 CInode *newi;
5647 public:
5648 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5649 ServerLogContext(s, r), dn(d), newi(ni) {}
5650 void finish(int r) override {
5651 ceph_assert(r == 0);
5652
5653 // link the inode
5654 dn->pop_projected_linkage();
5655
5656 // be a bit hacky with the inode version, here.. we decrement it
5657 // just to keep mark_dirty() happy. (we didn't bother projecting
5658 // a new version of the inode since it's just been created)
5659 newi->inode.version--;
5660 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5661 newi->mark_dirty_parent(mdr->ls, true);
5662
5663 // mkdir?
5664 if (newi->inode.is_dir()) {
5665 CDir *dir = newi->get_dirfrag(frag_t());
5666 ceph_assert(dir);
5667 dir->fnode.version--;
5668 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5669 dir->mark_new(mdr->ls);
5670 }
5671
5672 mdr->apply();
5673
5674 MDRequestRef null_ref;
5675 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5676
5677 if (newi->inode.is_file())
5678 get_mds()->locker->share_inode_max_size(newi);
5679
5680 // hit pop
5681 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
5682
5683 // reply
5684 server->respond_to_request(mdr, 0);
5685 }
5686 };
5687
5688
5689 void Server::handle_client_mknod(MDRequestRef& mdr)
5690 {
5691 const MClientRequest::const_ref &req = mdr->client_request;
5692 client_t client = mdr->get_client();
5693 MutationImpl::LockOpVec lov;
5694 file_layout_t *dir_layout = nullptr;
5695 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false,
5696 &dir_layout);
5697 if (!dn) return;
5698 if (mdr->snapid != CEPH_NOSNAP) {
5699 respond_to_request(mdr, -EROFS);
5700 return;
5701 }
5702 CInode *diri = dn->get_dir()->get_inode();
5703 lov.add_rdlock(&diri->authlock);
5704 if (!mds->locker->acquire_locks(mdr, lov))
5705 return;
5706
5707 if (!check_access(mdr, diri, MAY_WRITE))
5708 return;
5709
5710 if (!check_fragment_space(mdr, dn->get_dir()))
5711 return;
5712
5713 unsigned mode = req->head.args.mknod.mode;
5714 if ((mode & S_IFMT) == 0)
5715 mode |= S_IFREG;
5716
5717 // set layout
5718 file_layout_t layout;
5719 if (dir_layout && S_ISREG(mode))
5720 layout = *dir_layout;
5721 else
5722 layout = mdcache->default_file_layout;
5723
5724 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
5725 ceph_assert(newi);
5726
5727 dn->push_projected_linkage(newi);
5728
5729 newi->inode.rdev = req->head.args.mknod.rdev;
5730 newi->inode.version = dn->pre_dirty();
5731 newi->inode.rstat.rfiles = 1;
5732 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5733 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5734 newi->inode.update_backtrace();
5735
5736 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5737 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5738 ceph_assert(follows >= realm->get_newest_seq());
5739
5740 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5741 // want to write to it (e.g., if they are re-exporting over NFS)
5742 if (S_ISREG(newi->inode.mode)) {
5743 // issue a cap on the file
5744 int cmode = CEPH_FILE_MODE_RDWR;
5745 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5746 if (cap) {
5747 cap->set_wanted(0);
5748
5749 // put locks in excl mode
5750 newi->filelock.set_state(LOCK_EXCL);
5751 newi->authlock.set_state(LOCK_EXCL);
5752 newi->xattrlock.set_state(LOCK_EXCL);
5753
5754 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5755 newi->inode.client_ranges[client].range.first = 0;
5756 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5757 newi->inode.client_ranges[client].follows = follows;
5758 cap->mark_clientwriteable();
5759 }
5760 }
5761
5762 ceph_assert(dn->first == follows + 1);
5763 newi->first = dn->first;
5764
5765 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5766
5767 // prepare finisher
5768 mdr->ls = mdlog->get_current_segment();
5769 EUpdate *le = new EUpdate(mdlog, "mknod");
5770 mdlog->start_entry(le);
5771 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5772 journal_allocated_inos(mdr, &le->metablob);
5773
5774 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5775 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5776 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5777
5778 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5779 }
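// Note (not part of the original source): for a regular file the new inode's
// filelock/authlock/xattrlock start in LOCK_EXCL and the creating client is
// issued RDWR caps, so it can buffer its first writes without another MDS
// round trip; the client_ranges entry pre-authorizes growth up to one layout
// size increment before the client has to ask the MDS for more.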
5780
5781
5782
5783 // MKDIR
5784 /* This function takes responsibility for the passed mdr*/
5785 void Server::handle_client_mkdir(MDRequestRef& mdr)
5786 {
5787 const MClientRequest::const_ref &req = mdr->client_request;
5788 if (req->get_filepath().is_last_dot_or_dotdot()) {
5789 respond_to_request(mdr, -EEXIST);
5790 return;
5791 }
5792
5793 MutationImpl::LockOpVec lov;
5794 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5795 if (!dn) return;
5796 if (mdr->snapid != CEPH_NOSNAP) {
5797 respond_to_request(mdr, -EROFS);
5798 return;
5799 }
5800 CDir *dir = dn->get_dir();
5801 CInode *diri = dir->get_inode();
5802 lov.add_rdlock(&diri->authlock);
5803 if (!mds->locker->acquire_locks(mdr, lov))
5804 return;
5805
5806 // mkdir check access
5807 if (!check_access(mdr, diri, MAY_WRITE))
5808 return;
5809
5810 if (!check_fragment_space(mdr, dir))
5811 return;
5812
5813 // new inode
5814 unsigned mode = req->head.args.mkdir.mode;
5815 mode &= ~S_IFMT;
5816 mode |= S_IFDIR;
5817 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5818 ceph_assert(newi);
5819
5820 // it's a directory.
5821 dn->push_projected_linkage(newi);
5822
5823 newi->inode.version = dn->pre_dirty();
5824 newi->inode.rstat.rsubdirs = 1;
5825 newi->inode.update_backtrace();
5826
5827 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5828 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5829 ceph_assert(follows >= realm->get_newest_seq());
5830
5831 dout(12) << " follows " << follows << dendl;
5832 ceph_assert(dn->first == follows + 1);
5833 newi->first = dn->first;
5834
5835 // ...and that new dir is empty.
5836 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5837 newdir->state_set(CDir::STATE_CREATING);
5838 newdir->mark_complete();
5839 newdir->fnode.version = newdir->pre_dirty();
5840
5841 // prepare finisher
5842 mdr->ls = mdlog->get_current_segment();
5843 EUpdate *le = new EUpdate(mdlog, "mkdir");
5844 mdlog->start_entry(le);
5845 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5846 journal_allocated_inos(mdr, &le->metablob);
5847 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5848 le->metablob.add_primary_dentry(dn, newi, true, true);
5849 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5850
5851 // issue a cap on the directory
5852 int cmode = CEPH_FILE_MODE_RDWR;
5853 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5854 if (cap) {
5855 cap->set_wanted(0);
5856
5857 // put locks in excl mode
5858 newi->filelock.set_state(LOCK_EXCL);
5859 newi->authlock.set_state(LOCK_EXCL);
5860 newi->xattrlock.set_state(LOCK_EXCL);
5861 }
5862
5863 // make sure this inode gets into the journal
5864 le->metablob.add_opened_ino(newi->ino());
5865
5866 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5867
5868 // We hit_dir (via hit_inode) in our finish callback, but by then we might
5869 // have overshot the split size (multiple mkdir in flight), so here is
5870 // an early chance to split the dir if this mkdir makes it oversized.
5871 mds->balancer->maybe_fragment(dir, false);
5872 }
5873
5874
5875 // SYMLINK
5876
5877 void Server::handle_client_symlink(MDRequestRef& mdr)
5878 {
5879 const MClientRequest::const_ref &req = mdr->client_request;
5880 MutationImpl::LockOpVec lov;
5881 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5882 if (!dn) return;
5883 if (mdr->snapid != CEPH_NOSNAP) {
5884 respond_to_request(mdr, -EROFS);
5885 return;
5886 }
5887 CDir *dir = dn->get_dir();
5888 CInode *diri = dir->get_inode();
5889 lov.add_rdlock(&diri->authlock);
5890 if (!mds->locker->acquire_locks(mdr, lov))
5891 return;
5892
5893 if (!check_access(mdr, diri, MAY_WRITE))
5894 return;
5895
5896 if (!check_fragment_space(mdr, dir))
5897 return;
5898
5899 unsigned mode = S_IFLNK | 0777;
5900 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5901 ceph_assert(newi);
5902
5903 // it's a symlink
5904 dn->push_projected_linkage(newi);
5905
5906 newi->symlink = req->get_path2();
5907 newi->inode.size = newi->symlink.length();
5908 newi->inode.rstat.rbytes = newi->inode.size;
5909 newi->inode.rstat.rfiles = 1;
5910 newi->inode.version = dn->pre_dirty();
5911 newi->inode.update_backtrace();
5912
5913 newi->first = dn->first;
5914
5915 // prepare finisher
5916 mdr->ls = mdlog->get_current_segment();
5917 EUpdate *le = new EUpdate(mdlog, "symlink");
5918 mdlog->start_entry(le);
5919 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5920 journal_allocated_inos(mdr, &le->metablob);
5921 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5922 le->metablob.add_primary_dentry(dn, newi, true, true);
5923
5924 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5925 }
5926
5927
5928
5929
5930
5931 // LINK
5932
5933 void Server::handle_client_link(MDRequestRef& mdr)
5934 {
5935 const MClientRequest::const_ref &req = mdr->client_request;
5936
5937 dout(7) << "handle_client_link " << req->get_filepath()
5938 << " to " << req->get_filepath2()
5939 << dendl;
5940
5941 MutationImpl::LockOpVec lov;
5942
5943 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5944 if (!dn) return;
5945 CInode *targeti = rdlock_path_pin_ref(mdr, 1, lov, false);
5946 if (!targeti) return;
5947 if (mdr->snapid != CEPH_NOSNAP) {
5948 respond_to_request(mdr, -EROFS);
5949 return;
5950 }
5951
5952 CDir *dir = dn->get_dir();
5953 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5954 dout(7) << "target is " << *targeti << dendl;
5955 if (targeti->is_dir()) {
5956 // if srcdn is replica, need to make sure its linkage is correct
5957 vector<CDentry*>& trace = mdr->dn[1];
5958 if (trace.empty() ||
5959 trace.back()->is_auth() ||
5960 trace.back()->lock.can_read(mdr->get_client())) {
5961 dout(7) << "target is a dir, failing..." << dendl;
5962 respond_to_request(mdr, -EINVAL);
5963 return;
5964 }
5965 }
5966
5967 lov.erase_rdlock(&targeti->snaplock);
5968 lov.add_xlock(&targeti->snaplock);
5969 lov.add_xlock(&targeti->linklock);
5970
5971 if (!mds->locker->acquire_locks(mdr, lov))
5972 return;
5973
5974 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5975 if (!check_access(mdr, targeti, MAY_WRITE))
5976 return;
5977
5978 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5979 return;
5980
5981 if (!check_fragment_space(mdr, dir))
5982 return;
5983 }
5984
5985 // go!
5986 ceph_assert(g_conf()->mds_kill_link_at != 1);
5987
5988 // local or remote?
5989 if (targeti->is_auth())
5990 _link_local(mdr, dn, targeti);
5991 else
5992 _link_remote(mdr, true, dn, targeti);
5993 }
5994
5995
5996 class C_MDS_link_local_finish : public ServerLogContext {
5997 CDentry *dn;
5998 CInode *targeti;
5999 version_t dnpv;
6000 version_t tipv;
6001 bool adjust_realm;
6002 public:
6003 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
6004 version_t dnpv_, version_t tipv_, bool ar) :
6005 ServerLogContext(s, r), dn(d), targeti(ti),
6006 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
6007 void finish(int r) override {
6008 ceph_assert(r == 0);
6009 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
6010 }
6011 };
6012
6013
6014 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
6015 {
6016 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6017
6018 mdr->ls = mdlog->get_current_segment();
6019
6020 // predirty NEW dentry
6021 version_t dnpv = dn->pre_dirty();
6022 version_t tipv = targeti->pre_dirty();
6023
6024 // project inode update
6025 auto &pi = targeti->project_inode();
6026 pi.inode.nlink++;
6027 pi.inode.ctime = mdr->get_op_stamp();
6028 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6029 pi.inode.rstat.rctime = mdr->get_op_stamp();
6030 pi.inode.change_attr++;
6031 pi.inode.version = tipv;
6032
6033 bool adjust_realm = false;
6034 if (!targeti->is_projected_snaprealm_global()) {
6035 sr_t *newsnap = targeti->project_snaprealm();
6036 targeti->mark_snaprealm_global(newsnap);
6037 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6038 adjust_realm = true;
6039 }
6040
6041 // log + wait
6042 EUpdate *le = new EUpdate(mdlog, "link_local");
6043 mdlog->start_entry(le);
6044 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6045 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6046 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6047 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6048 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6049
6050 // do this after predirty_*, to avoid funky extra dnl arg
6051 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6052
6053 journal_and_reply(mdr, targeti, dn, le,
6054 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6055 }
6056
6057 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6058 version_t dnpv, version_t tipv, bool adjust_realm)
6059 {
6060 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6061
6062 // link and unlock the NEW dentry
6063 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6064 if (!dnl->get_inode())
6065 dn->link_remote(dnl, targeti);
6066 dn->mark_dirty(dnpv, mdr->ls);
6067
6068 // target inode
6069 targeti->pop_and_dirty_projected_inode(mdr->ls);
6070
6071 mdr->apply();
6072
6073 MDRequestRef null_ref;
6074 mdcache->send_dentry_link(dn, null_ref);
6075
6076 if (adjust_realm) {
6077 int op = CEPH_SNAP_OP_SPLIT;
6078 mds->mdcache->send_snap_update(targeti, 0, op);
6079 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6080 }
6081
6082 // bump target popularity
6083 mds->balancer->hit_inode(targeti, META_POP_IWR);
6084 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6085
6086 // reply
6087 respond_to_request(mdr, 0);
6088 }
6089
6090
6091 // link / unlink remote
6092
6093 class C_MDS_link_remote_finish : public ServerLogContext {
6094 bool inc;
6095 CDentry *dn;
6096 CInode *targeti;
6097 version_t dpv;
6098 public:
6099 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6100 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6101 dpv(d->get_projected_version()) {}
6102 void finish(int r) override {
6103 ceph_assert(r == 0);
6104 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6105 }
6106 };
6107
6108 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6109 {
6110 dout(10) << "_link_remote "
6111 << (inc ? "link ":"unlink ")
6112 << *dn << " to " << *targeti << dendl;
6113
6114 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6115 mds_rank_t linkauth = targeti->authority().first;
6116 if (mdr->more()->witnessed.count(linkauth) == 0) {
6117 if (mds->is_cluster_degraded() &&
6118 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6119 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6120 if (mdr->more()->waiting_on_slave.empty())
6121 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6122 return;
6123 }
6124
6125 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6126 int op;
6127 if (inc)
6128 op = MMDSSlaveRequest::OP_LINKPREP;
6129 else
6130 op = MMDSSlaveRequest::OP_UNLINKPREP;
6131 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, op);
6132 targeti->set_object_info(req->get_object_info());
6133 req->op_stamp = mdr->get_op_stamp();
6134 if (auto& desti_srnode = mdr->more()->desti_srnode)
6135 encode(*desti_srnode, req->desti_snapbl);
6136 mds->send_message_mds(req, linkauth);
6137
6138 ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
6139 mdr->more()->waiting_on_slave.insert(linkauth);
6140 return;
6141 }
6142 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6143
6144 ceph_assert(g_conf()->mds_kill_link_at != 2);
6145
6146 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6147 delete desti_srnode;
6148 desti_srnode = NULL;
6149 }
6150
6151 mdr->set_mds_stamp(ceph_clock_now());
6152
6153 // add to event
6154 mdr->ls = mdlog->get_current_segment();
6155 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6156 mdlog->start_entry(le);
6157 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6158 if (!mdr->more()->witnessed.empty()) {
6159 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6160 le->reqid = mdr->reqid;
6161 le->had_slaves = true;
6162 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6163 }
6164
6165 if (inc) {
6166 dn->pre_dirty();
6167 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6168 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6169 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6170 } else {
6171 dn->pre_dirty();
6172 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6173 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6174 le->metablob.add_null_dentry(dn, true);
6175 dn->push_projected_linkage();
6176 }
6177
6178 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6179 }
6180
6181 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6182 CDentry *dn, CInode *targeti,
6183 version_t dpv)
6184 {
6185 dout(10) << "_link_remote_finish "
6186 << (inc ? "link ":"unlink ")
6187 << *dn << " to " << *targeti << dendl;
6188
6189 ceph_assert(g_conf()->mds_kill_link_at != 3);
6190
6191 if (!mdr->more()->witnessed.empty())
6192 mdcache->logged_master_update(mdr->reqid);
6193
6194 if (inc) {
6195 // link the new dentry
6196 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6197 if (!dnl->get_inode())
6198 dn->link_remote(dnl, targeti);
6199 dn->mark_dirty(dpv, mdr->ls);
6200 } else {
6201 // unlink main dentry
6202 dn->get_dir()->unlink_inode(dn);
6203 dn->pop_projected_linkage();
6204 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6205 }
6206
6207 mdr->apply();
6208
6209 MDRequestRef null_ref;
6210 if (inc)
6211 mdcache->send_dentry_link(dn, null_ref);
6212 else
6213 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6214
6215 // bump target popularity
6216 mds->balancer->hit_inode(targeti, META_POP_IWR);
6217 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6218
6219 // reply
6220 respond_to_request(mdr, 0);
6221
6222 if (!inc)
6223 // removing a new dn?
6224 dn->get_dir()->try_remove_unlinked_dn(dn);
6225 }
6226
6227
6228 // remote linking/unlinking
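// Illustrative summary (not part of the original source) of the slave side
// of the link/unlink protocol implemented below:
//
//   1. the master (_link_remote) sends OP_LINKPREP / OP_UNLINKPREP to the
//      target inode's auth mds and waits.
//   2. the slave (handle_slave_link_prep) projects nlink++/--, journals an
//      ESlaveUpdate::OP_PREPARE together with a rollback blob, then acks
//      with OP_LINKPREPACK (_logged_slave_link).
//   3. the master records the witness (handle_slave_link_prep_ack) and
//      re-dispatches the request; once the master update is logged, the
//      slave's slave_commit waiter fires _commit_slave_link, which journals
//      ESlaveUpdate::OP_COMMIT and replies with OP_COMMITTED.
//   4. on failure, do_link_rollback() replays the rollback blob to restore
//      nlink, ctime and the parent directory's fragstat/rstat.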
6229
6230 class C_MDS_SlaveLinkPrep : public ServerLogContext {
6231 CInode *targeti;
6232 bool adjust_realm;
6233 public:
6234 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6235 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6236 void finish(int r) override {
6237 ceph_assert(r == 0);
6238 server->_logged_slave_link(mdr, targeti, adjust_realm);
6239 }
6240 };
6241
6242 class C_MDS_SlaveLinkCommit : public ServerContext {
6243 MDRequestRef mdr;
6244 CInode *targeti;
6245 public:
6246 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6247 ServerContext(s), mdr(r), targeti(t) { }
6248 void finish(int r) override {
6249 server->_commit_slave_link(mdr, r, targeti);
6250 }
6251 };
6252
6253 void Server::handle_slave_link_prep(MDRequestRef& mdr)
6254 {
6255 dout(10) << "handle_slave_link_prep " << *mdr
6256 << " on " << mdr->slave_request->get_object_info()
6257 << dendl;
6258
6259 ceph_assert(g_conf()->mds_kill_link_at != 4);
6260
6261 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
6262 ceph_assert(targeti);
6263 dout(10) << "targeti " << *targeti << dendl;
6264 CDentry *dn = targeti->get_parent_dn();
6265 CDentry::linkage_t *dnl = dn->get_linkage();
6266 ceph_assert(dnl->is_primary());
6267
6268 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6269
6270 mdr->auth_pin(targeti);
6271
6272 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6273 ceph_assert(g_conf()->mds_kill_link_at != 5);
6274
6275 // journal it
6276 mdr->ls = mdlog->get_current_segment();
6277 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
6278 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
6279 mdlog->start_entry(le);
6280
6281 auto &pi = dnl->get_inode()->project_inode();
6282
6283 // update journaled target inode
6284 bool inc;
6285 bool adjust_realm = false;
6286 bool realm_projected = false;
6287 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
6288 inc = true;
6289 pi.inode.nlink++;
6290 if (!targeti->is_projected_snaprealm_global()) {
6291 sr_t *newsnap = targeti->project_snaprealm();
6292 targeti->mark_snaprealm_global(newsnap);
6293 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6294 adjust_realm = true;
6295 realm_projected = true;
6296 }
6297 } else {
6298 inc = false;
6299 pi.inode.nlink--;
6300 if (targeti->is_projected_snaprealm_global()) {
6301 ceph_assert(mdr->slave_request->desti_snapbl.length());
6302 auto p = mdr->slave_request->desti_snapbl.cbegin();
6303
6304 sr_t *newsnap = targeti->project_snaprealm();
6305 decode(*newsnap, p);
6306
6307 if (pi.inode.nlink == 0)
6308 ceph_assert(!newsnap->is_parent_global());
6309
6310 realm_projected = true;
6311 } else {
6312 ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
6313 }
6314 }
6315
6316 link_rollback rollback;
6317 rollback.reqid = mdr->reqid;
6318 rollback.ino = targeti->ino();
6319 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
6320 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
6321 rollback.old_dir_mtime = pf->fragstat.mtime;
6322 rollback.old_dir_rctime = pf->rstat.rctime;
6323 rollback.was_inc = inc;
6324 if (realm_projected) {
6325 if (targeti->snaprealm) {
6326 encode(true, rollback.snapbl);
6327 targeti->encode_snap_blob(rollback.snapbl);
6328 } else {
6329 encode(false, rollback.snapbl);
6330 }
6331 }
6332 encode(rollback, le->rollback);
6333 mdr->more()->rollback_bl = le->rollback;
6334
6335 pi.inode.ctime = mdr->get_op_stamp();
6336 pi.inode.version = targeti->pre_dirty();
6337
6338 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
6339
6340 // commit case
6341 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6342 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
6343
6344 // set up commit waiter
6345 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
6346
6347 mdr->more()->slave_update_journaled = true;
6348 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
6349 mdr, __func__);
6350 mdlog->flush();
6351 }
6352
6353 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
6354 {
6355 dout(10) << "_logged_slave_link " << *mdr
6356 << " " << *targeti << dendl;
6357
6358 ceph_assert(g_conf()->mds_kill_link_at != 6);
6359
6360 // update the target
6361 targeti->pop_and_dirty_projected_inode(mdr->ls);
6362 mdr->apply();
6363
6364 // hit pop
6365 mds->balancer->hit_inode(targeti, META_POP_IWR);
6366
6367 // done.
6368 mdr->reset_slave_request();
6369
6370 if (adjust_realm) {
6371 int op = CEPH_SNAP_OP_SPLIT;
6372 mds->mdcache->send_snap_update(targeti, 0, op);
6373 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6374 }
6375
6376 // ack
6377 if (!mdr->aborted) {
6378 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
6379 mds->send_message_mds(reply, mdr->slave_to_mds);
6380 } else {
6381 dout(10) << " abort flag set, finishing" << dendl;
6382 mdcache->request_finish(mdr);
6383 }
6384 }
6385
6386
6387 struct C_MDS_CommittedSlave : public ServerLogContext {
6388 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
6389 void finish(int r) override {
6390 server->_committed_slave(mdr);
6391 }
6392 };
6393
6394 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
6395 {
6396 dout(10) << "_commit_slave_link " << *mdr
6397 << " r=" << r
6398 << " " << *targeti << dendl;
6399
6400 ceph_assert(g_conf()->mds_kill_link_at != 7);
6401
6402 if (r == 0) {
6403 // drop our pins, etc.
6404 mdr->cleanup();
6405
6406 // write a commit to the journal
6407 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
6408 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
6409 mdlog->start_entry(le);
6410 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6411 mdlog->flush();
6412 } else {
6413 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6414 }
6415 }
6416
6417 void Server::_committed_slave(MDRequestRef& mdr)
6418 {
6419 dout(10) << "_committed_slave " << *mdr << dendl;
6420
6421 ceph_assert(g_conf()->mds_kill_link_at != 8);
6422
6423 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
6424 mds->send_message_mds(req, mdr->slave_to_mds);
6425 mdcache->request_finish(mdr);
6426 }
6427
6428 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
6429 MutationRef mut;
6430 map<client_t,MClientSnap::ref> splits;
6431 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
6432 map<client_t,MClientSnap::ref>&& _splits) :
6433 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
6434 }
6435 void finish(int r) override {
6436 server->_link_rollback_finish(mut, mdr, splits);
6437 }
6438 };
6439
6440 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6441 {
6442 link_rollback rollback;
6443 auto p = rbl.cbegin();
6444 decode(rollback, p);
6445
6446 dout(10) << "do_link_rollback on " << rollback.reqid
6447 << (rollback.was_inc ? " inc":" dec")
6448 << " ino " << rollback.ino
6449 << dendl;
6450
6451 ceph_assert(g_conf()->mds_kill_link_at != 9);
6452
6453 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6454 ceph_assert(mdr || mds->is_resolve());
6455
6456 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
6457 mut->ls = mds->mdlog->get_current_segment();
6458
6459 CInode *in = mdcache->get_inode(rollback.ino);
6460 ceph_assert(in);
6461 dout(10) << " target is " << *in << dendl;
6462 ceph_assert(!in->is_projected()); // live slave request holds versionlock xlock.
6463
6464 auto &pi = in->project_inode();
6465 pi.inode.version = in->pre_dirty();
6466 mut->add_projected_inode(in);
6467
6468 // parent dir rctime
6469 CDir *parent = in->get_projected_parent_dn()->get_dir();
6470 fnode_t *pf = parent->project_fnode();
6471 mut->add_projected_fnode(parent);
6472 pf->version = parent->pre_dirty();
6473 if (pf->fragstat.mtime == pi.inode.ctime) {
6474 pf->fragstat.mtime = rollback.old_dir_mtime;
6475 if (pf->rstat.rctime == pi.inode.ctime)
6476 pf->rstat.rctime = rollback.old_dir_rctime;
6477 mut->add_updated_lock(&parent->get_inode()->filelock);
6478 mut->add_updated_lock(&parent->get_inode()->nestlock);
6479 }
6480
6481 // inode
6482 pi.inode.ctime = rollback.old_ctime;
6483 if (rollback.was_inc)
6484 pi.inode.nlink--;
6485 else
6486 pi.inode.nlink++;
6487
6488 map<client_t,MClientSnap::ref> splits;
6489 if (rollback.snapbl.length() && in->snaprealm) {
6490 bool hadrealm;
6491 auto p = rollback.snapbl.cbegin();
6492 decode(hadrealm, p);
6493 if (hadrealm) {
6494 if (!mds->is_resolve()) {
6495 sr_t *new_srnode = new sr_t();
6496 decode(*new_srnode, p);
6497 in->project_snaprealm(new_srnode);
6498 } else {
6499 decode(in->snaprealm->srnode, p);
6500 }
6501 } else {
6502 SnapRealm *realm = parent->get_inode()->find_snaprealm();
6503 if (!mds->is_resolve())
6504 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
6505 in->project_snaprealm(NULL);
6506 }
6507 }
6508
6509 // journal it
6510 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
6511 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
6512 mdlog->start_entry(le);
6513 le->commit.add_dir_context(parent);
6514 le->commit.add_dir(parent, true);
6515 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
6516
6517 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
6518 mdr, __func__);
6519 mdlog->flush();
6520 }
6521
6522 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
6523 map<client_t,MClientSnap::ref>& splits)
6524 {
6525 dout(10) << "_link_rollback_finish" << dendl;
6526
6527 ceph_assert(g_conf()->mds_kill_link_at != 10);
6528
6529 mut->apply();
6530
6531 if (!mds->is_resolve())
6532 mdcache->send_snaps(splits);
6533
6534 if (mdr)
6535 mdcache->request_finish(mdr);
6536
6537 mdcache->finish_rollback(mut->reqid);
6538
6539 mut->cleanup();
6540 }
6541
6542
6543 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m)
6544 {
6545 dout(10) << "handle_slave_link_prep_ack " << *mdr
6546 << " " << *m << dendl;
6547 mds_rank_t from = mds_rank_t(m->get_source().num());
6548
6549 ceph_assert(g_conf()->mds_kill_link_at != 11);
6550
6551 // note slave
6552 mdr->more()->slaves.insert(from);
6553
6554 // witnessed!
6555 ceph_assert(mdr->more()->witnessed.count(from) == 0);
6556 mdr->more()->witnessed.insert(from);
6557 ceph_assert(!m->is_not_journaled());
6558 mdr->more()->has_journaled_slaves = true;
6559
6560 // remove from waiting list
6561 ceph_assert(mdr->more()->waiting_on_slave.count(from));
6562 mdr->more()->waiting_on_slave.erase(from);
6563
6564 ceph_assert(mdr->more()->waiting_on_slave.empty());
6565
6566 dispatch_client_request(mdr); // go again!
6567 }
6568
6569
6570
6571
6572
6573 // UNLINK
6574
6575 void Server::handle_client_unlink(MDRequestRef& mdr)
6576 {
6577 const MClientRequest::const_ref &req = mdr->client_request;
6578 client_t client = mdr->get_client();
6579
6580 // rmdir or unlink?
6581 bool rmdir = false;
6582 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
6583
6584 const filepath& refpath = req->get_filepath();
6585 if (refpath.depth() == 0) {
6586 respond_to_request(mdr, -EINVAL);
6587 return;
6588 }
6589 if (refpath.is_last_dot_or_dotdot()) {
6590 respond_to_request(mdr, -ENOTEMPTY);
6591 return;
6592 }
6593
6594 // traverse to path
6595 vector<CDentry*> trace;
6596 CInode *in;
6597 CF_MDS_MDRContextFactory cf(mdcache, mdr);
6598 int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &in, MDS_TRAVERSE_FORWARD);
6599 if (r > 0) return;
6600 if (r < 0) {
6601 if (r == -ESTALE) {
6602 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6603 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6604 return;
6605 }
6606 respond_to_request(mdr, r);
6607 return;
6608 }
6609 if (mdr->snapid != CEPH_NOSNAP) {
6610 respond_to_request(mdr, -EROFS);
6611 return;
6612 }
6613
6614 CDentry *dn = trace.back();
6615 ceph_assert(dn);
6616 if (!dn->is_auth()) {
6617 mdcache->request_forward(mdr, dn->authority().first);
6618 return;
6619 }
6620
6621 CInode *diri = dn->get_dir()->get_inode();
6622
6623 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
6624 ceph_assert(!dnl->is_null());
6625
6626 if (rmdir) {
6627 dout(7) << "handle_client_rmdir on " << *dn << dendl;
6628 } else {
6629 dout(7) << "handle_client_unlink on " << *dn << dendl;
6630 }
6631 dout(7) << "dn links to " << *in << dendl;
6632
6633 // rmdir vs is_dir
6634 if (in->is_dir()) {
6635 if (rmdir) {
6636 // do empty directory checks
6637 if (_dir_is_nonempty_unlocked(mdr, in)) {
6638 respond_to_request(mdr, -ENOTEMPTY);
6639 return;
6640 }
6641 } else {
6642 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
6643 respond_to_request(mdr, -EISDIR);
6644 return;
6645 }
6646 } else {
6647 if (rmdir) {
6648 // unlink
6649 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
6650 respond_to_request(mdr, -ENOTDIR);
6651 return;
6652 }
6653 }
6654
6655 // -- create stray dentry? --
6656 CDentry *straydn = NULL;
6657 if (dnl->is_primary()) {
6658 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
6659 if (!straydn)
6660 return;
6661 dout(10) << " straydn is " << *straydn << dendl;
6662 } else if (mdr->straydn) {
6663 mdr->unpin(mdr->straydn);
6664 mdr->straydn = NULL;
6665 }
6666
6667 // lock
6668 MutationImpl::LockOpVec lov;
6669
6670 for (int i=0; i<(int)trace.size()-1; i++)
6671 lov.add_rdlock(&trace[i]->lock);
6672 lov.add_xlock(&dn->lock);
6673 lov.add_wrlock(&diri->filelock);
6674 lov.add_wrlock(&diri->nestlock);
6675 lov.add_xlock(&in->linklock);
6676 if (straydn) {
6677 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
6678 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
6679 lov.add_xlock(&straydn->lock);
6680 }
6681
6682 mds->locker->include_snap_rdlocks(diri, lov);
6683 lov.add_xlock(&in->snaplock);
6684 if (in->is_dir())
6685 lov.add_rdlock(&in->filelock); // to verify it's empty
6686
6687 if (!mds->locker->acquire_locks(mdr, lov))
6688 return;
6689
6690 if (in->is_dir() &&
6691 _dir_is_nonempty(mdr, in)) {
6692 respond_to_request(mdr, -ENOTEMPTY);
6693 return;
6694 }
6695
6696 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6697 if (!check_access(mdr, diri, MAY_WRITE))
6698 return;
6699 }
6700
6701 if (straydn)
6702 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
6703
6704 if (!mdr->more()->desti_srnode) {
6705 if (in->is_projected_snaprealm_global()) {
6706 sr_t *new_srnode = in->prepare_new_srnode(0);
6707 in->record_snaprealm_parent_dentry(new_srnode, NULL, dn, dnl->is_primary());
6708 // dropping the last linkage or dropping the last remote linkage,
6709 // detach the inode from the global snaprealm
6710 auto nlink = in->get_projected_inode()->nlink;
6711 if (nlink == 1 ||
6712 (nlink == 2 && !dnl->is_primary() &&
6713 !in->get_projected_parent_dir()->inode->is_stray()))
6714 in->clear_snaprealm_global(new_srnode);
6715 mdr->more()->desti_srnode = new_srnode;
6716 } else if (dnl->is_primary()) {
6717 // prepare snaprealm blob for slave request
6718 SnapRealm *realm = in->find_snaprealm();
6719 snapid_t follows = realm->get_newest_seq();
6720 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
6721 sr_t *new_srnode = in->prepare_new_srnode(follows);
6722 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
6723 mdr->more()->desti_srnode = new_srnode;
6724 }
6725 }
6726 }
6727
6728 // yay!
6729 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
6730 // subtree root auths need to be witnesses
6731 set<mds_rank_t> witnesses;
6732 in->list_replicas(witnesses);
6733 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6734
6735 for (set<mds_rank_t>::iterator p = witnesses.begin();
6736 p != witnesses.end();
6737 ++p) {
6738 if (mdr->more()->witnessed.count(*p)) {
6739 dout(10) << " already witnessed by mds." << *p << dendl;
6740 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6741 dout(10) << " already waiting on witness mds." << *p << dendl;
6742 } else {
6743 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
6744 return;
6745 }
6746 }
6747 if (!mdr->more()->waiting_on_slave.empty())
6748 return; // we're waiting for a witness.
6749 }
6750
6751 // ok!
6752 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
6753 _link_remote(mdr, false, dn, dnl->get_inode());
6754 else
6755 _unlink_local(mdr, dn, straydn);
6756 }
6757
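// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the rmdir/unlink flavor checks
// above reduce to a small decision table. check_unlink_flavor is a
// hypothetical helper; only the errno values mirror the real code.
#if 0
#include <cerrno>

static int check_unlink_flavor(bool rmdir, bool is_dir, bool dir_nonempty)
{
  if (is_dir && !rmdir)
    return -EISDIR;      // unlink() on a directory
  if (!is_dir && rmdir)
    return -ENOTDIR;     // rmdir() on a non-directory
  if (is_dir && dir_nonempty)
    return -ENOTEMPTY;   // rmdir() requires an empty directory
  return 0;              // ok to proceed
}
#endif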
6758 class C_MDS_unlink_local_finish : public ServerLogContext {
6759 CDentry *dn;
6760 CDentry *straydn;
6761 version_t dnpv; // deleted dentry
6762 public:
6763 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
6764 ServerLogContext(s, r), dn(d), straydn(sd),
6765 dnpv(d->get_projected_version()) {}
6766 void finish(int r) override {
6767 ceph_assert(r == 0);
6768 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
6769 }
6770 };
6771
6772 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6773 {
6774 dout(10) << "_unlink_local " << *dn << dendl;
6775
6776 CDentry::linkage_t *dnl = dn->get_projected_linkage();
6777 CInode *in = dnl->get_inode();
6778
6779
6780 // ok, let's do it.
6781 mdr->ls = mdlog->get_current_segment();
6782
6783 // prepare log entry
6784 EUpdate *le = new EUpdate(mdlog, "unlink_local");
6785 mdlog->start_entry(le);
6786 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6787 if (!mdr->more()->witnessed.empty()) {
6788 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6789 le->reqid = mdr->reqid;
6790 le->had_slaves = true;
6791 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6792 }
6793
6794 if (straydn) {
6795 ceph_assert(dnl->is_primary());
6796 straydn->push_projected_linkage(in);
6797 }
6798
6799 // the unlinked dentry
6800 dn->pre_dirty();
6801
6802 auto &pi = in->project_inode();
6803 {
6804 std::string t;
6805 dn->make_path_string(t, true);
6806 pi.inode.stray_prior_path = std::move(t);
6807 }
6808 pi.inode.version = in->pre_dirty();
6809 pi.inode.ctime = mdr->get_op_stamp();
6810 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6811 pi.inode.rstat.rctime = mdr->get_op_stamp();
6812 pi.inode.change_attr++;
6813 pi.inode.nlink--;
6814 if (pi.inode.nlink == 0)
6815 in->state_set(CInode::STATE_ORPHAN);
6816
6817 if (mdr->more()->desti_srnode) {
6818 auto& desti_srnode = mdr->more()->desti_srnode;
6819 in->project_snaprealm(desti_srnode);
6820 desti_srnode = NULL;
6821 }
6822
6823 if (straydn) {
6824 // will manually pop projected inode
6825
6826 // primary link. add stray dentry.
6827 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
6828 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6829
6830 pi.inode.update_backtrace();
6831 le->metablob.add_primary_dentry(straydn, in, true, true);
6832 } else {
6833 mdr->add_projected_inode(in);
6834 // remote link. update remote inode.
6835 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
6836 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6837 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
6838 }
6839
6840 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6841 le->metablob.add_null_dentry(dn, true);
6842
6843 if (in->is_dir()) {
6844 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6845 le->metablob.renamed_dirino = in->ino();
6846 }
6847
6848 dn->push_projected_linkage();
6849
6850 if (straydn) {
6851 ceph_assert(in->first <= straydn->first);
6852 in->first = straydn->first;
6853 }
6854
6855 if (in->is_dir()) {
6856 ceph_assert(straydn);
6857 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6858 }
6859
6860 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6861 }
6862
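// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the project/journal/apply pattern
// used by _unlink_local() above. All names here are invented for
// illustration; the real types are CInode, EUpdate and EMetaBlob.
#if 0
#include <optional>

struct example_inode { int nlink; unsigned change_attr; };

struct example_projection {
  example_inode cur;                  // what readers currently see
  std::optional<example_inode> proj;  // pending update, not yet journaled

  example_inode& project() {          // analogous to project_inode()
    proj = cur;
    return *proj;                     // mutate the copy: nlink--, ctime, ...
  }
  void pop_and_apply() {              // analogous to pop_and_dirty_projected_inode()
    cur = *proj;                      // journal committed: make it visible
    proj.reset();
  }
};
#endif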
6863 void Server::_unlink_local_finish(MDRequestRef& mdr,
6864 CDentry *dn, CDentry *straydn,
6865 version_t dnpv)
6866 {
6867 dout(10) << "_unlink_local_finish " << *dn << dendl;
6868
6869 if (!mdr->more()->witnessed.empty())
6870 mdcache->logged_master_update(mdr->reqid);
6871
6872 CInode *strayin = NULL;
6873 bool hadrealm = false;
6874 if (straydn) {
6875 // if there is a newly created snaprealm, we need to split the old snaprealm's
6876 // inodes_with_caps. So pop snaprealm before linkage changes.
6877 strayin = dn->get_linkage()->get_inode();
6878 hadrealm = strayin->snaprealm ? true : false;
6879 strayin->early_pop_projected_snaprealm();
6880 }
6881
6882 // unlink main dentry
6883 dn->get_dir()->unlink_inode(dn);
6884 dn->pop_projected_linkage();
6885
6886 // relink as stray? (i.e. was primary link?)
6887 if (straydn) {
6888 dout(20) << " straydn is " << *straydn << dendl;
6889 straydn->pop_projected_linkage();
6890
6891 strayin->pop_and_dirty_projected_inode(mdr->ls);
6892
6893 mdcache->touch_dentry_bottom(straydn);
6894 }
6895
6896 dn->mark_dirty(dnpv, mdr->ls);
6897 mdr->apply();
6898
6899 mdcache->send_dentry_unlink(dn, straydn, mdr);
6900
6901 if (straydn) {
6902 // update subtree map?
6903 if (strayin->is_dir())
6904 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6905
6906 if (strayin->snaprealm && !hadrealm)
6907 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
6908 }
6909
6910 // bump pop
6911 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6912
6913 // reply
6914 respond_to_request(mdr, 0);
6915
6916 // removing a new dn?
6917 dn->get_dir()->try_remove_unlinked_dn(dn);
6918
6919 // clean up ?
6920 // respond_to_request() drops locks. So stray reintegration can race with us.
6921 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6922 // Tip off the MDCache that this dentry is a stray that
6923 // might be eligible for purge.
6924 mdcache->notify_stray(straydn);
6925 }
6926 }
6927
6928 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6929 {
6930 if (mds->is_cluster_degraded() &&
6931 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6932 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6933 if (mdr->more()->waiting_on_slave.empty())
6934 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6935 return false;
6936 }
6937
6938 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6939 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
6940 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6941 for (auto dn : trace)
6942 req->srcdnpath.push_dentry(dn->get_name());
6943 mdcache->replicate_stray(straydn, who, req->straybl);
6944 if (mdr->more()->desti_srnode)
6945 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
6946
6947 req->op_stamp = mdr->get_op_stamp();
6948 mds->send_message_mds(req, who);
6949
6950 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
6951 mdr->more()->waiting_on_slave.insert(who);
6952 return true;
6953 }
6954
6955 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6956 CDentry *dn, *straydn;
6957 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6958 : ServerLogContext(s, r), dn(d), straydn(st) {}
6959 void finish(int r) override {
6960 server->_logged_slave_rmdir(mdr, dn, straydn);
6961 }
6962 };
6963
6964 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6965 MDRequestRef mdr;
6966 CDentry *straydn;
6967 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6968 : ServerContext(s), mdr(r), straydn(sd) { }
6969 void finish(int r) override {
6970 server->_commit_slave_rmdir(mdr, r, straydn);
6971 }
6972 };
6973
6974 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6975 {
6976 dout(10) << "handle_slave_rmdir_prep " << *mdr
6977 << " " << mdr->slave_request->srcdnpath
6978 << " to " << mdr->slave_request->destdnpath
6979 << dendl;
6980
6981 vector<CDentry*> trace;
6982 filepath srcpath(mdr->slave_request->srcdnpath);
6983 dout(10) << " src " << srcpath << dendl;
6984 CInode *in;
6985 CF_MDS_MDRContextFactory cf(mdcache, mdr);
6986 int r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6987 if (r > 0) return;
6988 if (r == -ESTALE) {
6989 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6990 mdr->slave_to_mds);
6991 return;
6992 }
6993 ceph_assert(r == 0);
6994 CDentry *dn = trace.back();
6995 dout(10) << " dn " << *dn << dendl;
6996 mdr->pin(dn);
6997
6998 ceph_assert(mdr->straydn);
6999 CDentry *straydn = mdr->straydn;
7000 dout(10) << " straydn " << *straydn << dendl;
7001
7002 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7003
7004 rmdir_rollback rollback;
7005 rollback.reqid = mdr->reqid;
7006 rollback.src_dir = dn->get_dir()->dirfrag();
7007 rollback.src_dname = dn->get_name();
7008 rollback.dest_dir = straydn->get_dir()->dirfrag();
7009 rollback.dest_dname = straydn->get_name();
7010 if (mdr->slave_request->desti_snapbl.length()) {
7011 if (in->snaprealm) {
7012 encode(true, rollback.snapbl);
7013 in->encode_snap_blob(rollback.snapbl);
7014 } else {
7015 encode(false, rollback.snapbl);
7016 }
7017 }
7018 encode(rollback, mdr->more()->rollback_bl);
7019 // FIXME: rollback snaprealm
7020 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7021
7022 // set up commit waiter
7023 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
7024
7025 straydn->push_projected_linkage(in);
7026 dn->push_projected_linkage();
7027
7028 ceph_assert(straydn->first >= in->first);
7029 in->first = straydn->first;
7030
7031 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7032 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7033 _logged_slave_rmdir(mdr, dn, straydn);
7034 return;
7035 }
7036
7037 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
7038 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
7039 mdlog->start_entry(le);
7040 le->rollback = mdr->more()->rollback_bl;
7041
7042 le->commit.add_dir_context(straydn->get_dir());
7043 le->commit.add_primary_dentry(straydn, in, true);
7044 // slave: no need to journal original dentry
7045
7046 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7047 le->commit.renamed_dirino = in->ino();
7048
7049 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7050
7051 mdr->more()->slave_update_journaled = true;
7052 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
7053 mdr, __func__);
7054 mdlog->flush();
7055 }
7056
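// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the rollback blob written in
// handle_slave_rmdir_prep() must decode in do_rmdir_rollback() exactly as it
// was encoded. Optional payloads, like the snap blob above, are prefixed with
// an explicit bool so the decoder knows what to expect. encode_snap_payload
// is a hypothetical stand-in.
#if 0
void encode_example(const rmdir_rollback &rb, bool have_snap, bufferlist &bl)
{
  encode(rb, bl);
  encode(have_snap, bl);       // explicit presence flag
  if (have_snap)
    encode_snap_payload(bl);   // hypothetical: mirrors encode_snap_blob()
}
#endif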
7057 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7058 {
7059 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
7060 CInode *in = dn->get_linkage()->get_inode();
7061
7062 bool new_realm;
7063 if (mdr->slave_request->desti_snapbl.length()) {
7064 new_realm = !in->snaprealm;
7065 in->decode_snap_blob(mdr->slave_request->desti_snapbl);
7066 ceph_assert(in->snaprealm);
7067 ceph_assert(in->snaprealm->have_past_parents_open());
7068 } else {
7069 new_realm = false;
7070 }
7071
7072 // update our cache now, so we are consistent with what is in the journal
7073 // when we journal a subtree map
7074 dn->get_dir()->unlink_inode(dn);
7075 straydn->pop_projected_linkage();
7076 dn->pop_projected_linkage();
7077
7078 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
7079
7080 if (new_realm)
7081 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7082
7083 // done.
7084 mdr->reset_slave_request();
7085 mdr->straydn = 0;
7086
7087 if (!mdr->aborted) {
7088 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
7089 if (!mdr->more()->slave_update_journaled)
7090 reply->mark_not_journaled();
7091 mds->send_message_mds(reply, mdr->slave_to_mds);
7092 } else {
7093 dout(10) << " abort flag set, finishing" << dendl;
7094 mdcache->request_finish(mdr);
7095 }
7096 }
7097
7098 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
7099 {
7100 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7101 << " " << *ack << dendl;
7102
7103 mds_rank_t from = mds_rank_t(ack->get_source().num());
7104
7105 mdr->more()->slaves.insert(from);
7106 mdr->more()->witnessed.insert(from);
7107 if (!ack->is_not_journaled())
7108 mdr->more()->has_journaled_slaves = true;
7109
7110 // remove from waiting list
7111 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7112 mdr->more()->waiting_on_slave.erase(from);
7113
7114 if (mdr->more()->waiting_on_slave.empty())
7115 dispatch_client_request(mdr); // go again!
7116 else
7117 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
7118 }
7119
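// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the prep-ack handler above is a
// simple barrier. Each ack moves a rank from the waiting set to the witnessed
// set; the request re-dispatches only once the waiting set drains. A generic
// analogue:
#if 0
#include <set>

static bool ack_witness(std::set<int> &waiting, std::set<int> &witnessed, int from)
{
  witnessed.insert(from);
  waiting.erase(from);
  return waiting.empty();  // true => safe to dispatch_client_request() again
}
#endif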
7120 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7121 {
7122 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
7123
7124 if (r == 0) {
7125 if (mdr->more()->slave_update_journaled) {
7126 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7127 if (strayin && !strayin->snaprealm)
7128 mdcache->clear_dirty_bits_for_stray(strayin);
7129 }
7130
7131 mdr->cleanup();
7132
7133 if (mdr->more()->slave_update_journaled) {
7134 // write a commit to the journal
7135 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
7136 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7137 ESlaveUpdate::RMDIR);
7138 mdlog->start_entry(le);
7139 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7140 mdlog->flush();
7141 } else {
7142 _committed_slave(mdr);
7143 }
7144 } else {
7145 // abort
7146 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
7147 }
7148 }
7149
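// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the slave-side commit protocol in
// _commit_slave_rmdir() above, schematically. r == 0 means the master
// committed; anything else means abort and roll back. The helper names are
// hypothetical.
#if 0
if (r == 0) {
  if (journaled_prepare)
    journal_commit_then_ack();  // ESlaveUpdate::OP_COMMIT, then _committed_slave()
  else
    ack_committed();            // nothing journaled: ack immediately
} else {
  do_rollback(saved_rollback_blob);
}
#endif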
7150 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7151 metareqid_t reqid;
7152 CDentry *dn;
7153 CDentry *straydn;
7154 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7155 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7156 void finish(int r) override {
7157 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7158 }
7159 };
7160
7161 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
7162 {
7163 // unlike the other rollback methods, the rmdir rollback is only
7164 // needed to record the subtree changes in the journal for inode
7165 // replicas who are auth for empty dirfrags. no actual changes to
7166 // the file system are taking place here, so there is no Mutation.
7167
7168 rmdir_rollback rollback;
7169 auto p = rbl.cbegin();
7170 decode(rollback, p);
7171
7172 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7173 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
7174 ceph_assert(mdr || mds->is_resolve());
7175
7176 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7177 if (!dir)
7178 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7179 ceph_assert(dir);
7180 CDentry *dn = dir->lookup(rollback.src_dname);
7181 ceph_assert(dn);
7182 dout(10) << " dn " << *dn << dendl;
7183 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7184 ceph_assert(straydir);
7185 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7186 ceph_assert(straydn);
7187 dout(10) << " straydn " << *straydn << dendl;
7188 CInode *in = straydn->get_linkage()->get_inode();
7189
7190 dn->push_projected_linkage(in);
7191 straydn->push_projected_linkage();
7192
7193 if (rollback.snapbl.length() && in->snaprealm) {
7194 bool hadrealm;
7195 auto p = rollback.snapbl.cbegin();
7196 decode(hadrealm, p);
7197 if (hadrealm) {
7198 decode(in->snaprealm->srnode, p);
7199 } else {
7200 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7201 }
7202 }
7203
7204 if (mdr && !mdr->more()->slave_update_journaled) {
7205 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7206
7207 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7208 return;
7209 }
7210
7211
7212 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
7213 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
7214 mdlog->start_entry(le);
7215
7216 le->commit.add_dir_context(dn->get_dir());
7217 le->commit.add_primary_dentry(dn, in, true);
7218 // slave: no need to journal straydn
7219
7220 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7221 le->commit.renamed_dirino = in->ino();
7222
7223 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7224
7225 submit_mdlog_entry(le,
7226 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7227 dn, straydn),
7228 mdr, __func__);
7229 mdlog->flush();
7230 }
7231
7232 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7233 {
7234 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7235
7236 straydn->get_dir()->unlink_inode(straydn);
7237 dn->pop_projected_linkage();
7238 straydn->pop_projected_linkage();
7239
7240 CInode *in = dn->get_linkage()->get_inode();
7241 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7242 !mdr || mdr->more()->slave_update_journaled);
7243
7244 if (mds->is_resolve()) {
7245 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7246 mdcache->try_trim_non_auth_subtree(root);
7247 }
7248
7249 if (mdr)
7250 mdcache->request_finish(mdr);
7251
7252 mdcache->finish_rollback(reqid);
7253 }
7254
7255
7256 /** _dir_is_nonempty[_unlocked]
7257 *
7258 * check whether a directory is non-empty (if it is, we cannot rmdir it).
7259 *
7260 * the unlocked variant is a fastpath check; we can't really be
7261 * sure until we rdlock the filelock.
7262 */
7263 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7264 {
7265 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7266 ceph_assert(in->is_auth());
7267
7268 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7269 return true; // in a snapshot!
7270
7271 list<CDir*> ls;
7272 in->get_dirfrags(ls);
7273 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7274 CDir *dir = *p;
7275 // is the frag obviously non-empty?
7276 if (dir->is_auth()) {
7277 if (dir->get_projected_fnode()->fragstat.size()) {
7278 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7279 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7280 return true;
7281 }
7282 }
7283 }
7284
7285 return false;
7286 }
7287
7288 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7289 {
7290 dout(10) << "dir_is_nonempty " << *in << dendl;
7291 ceph_assert(in->is_auth());
7292 ceph_assert(in->filelock.can_read(mdr->get_client()));
7293
7294 frag_info_t dirstat;
7295 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7296
7297 list<CDir*> ls;
7298 in->get_dirfrags(ls);
7299 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7300 CDir *dir = *p;
7301 const fnode_t *pf = dir->get_projected_fnode();
7302 if (pf->fragstat.size()) {
7303 dout(10) << "dir_is_nonempty dirstat has "
7304 << pf->fragstat.size() << " items " << *dir << dendl;
7305 return true;
7306 }
7307
7308 if (pf->accounted_fragstat.version == dirstat_version)
7309 dirstat.add(pf->accounted_fragstat);
7310 else
7311 dirstat.add(pf->fragstat);
7312 }
7313
7314 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7315 }
7316
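// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): why the emptiness check runs
// twice. The unlocked pass cheaply rejects obviously non-empty dirs based on
// projected fragstats; the locked pass, run once the filelock is rdlocked, is
// authoritative and additionally compares the summed per-frag stats against
// the inode's dirstat to catch entries accounted elsewhere.
#if 0
static bool provably_nonempty_fast(size_t any_frag_size)
{
  return any_frag_size > 0;             // cheap rejection before locking
}

static bool nonempty_once_locked(size_t summed_frag_size, size_t inode_dirstat_size)
{
  return summed_frag_size != inode_dirstat_size;  // mismatch => entries elsewhere
}
#endif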
7317
7318 // ======================================================
7319
7320
7321 class C_MDS_rename_finish : public ServerLogContext {
7322 CDentry *srcdn;
7323 CDentry *destdn;
7324 CDentry *straydn;
7325 public:
7326 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7327 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7328 ServerLogContext(s, r),
7329 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7330 void finish(int r) override {
7331 ceph_assert(r == 0);
7332 server->_rename_finish(mdr, srcdn, destdn, straydn);
7333 }
7334 };
7335
7336
7337 /** handle_client_rename
7338 *
7339 * rename master is the destdn auth. this is because cached inodes
7340 * must remain connected. thus, any replica of srci must also
7341 * replicate destdn, and possibly straydn, so that srci (and
7342 * destdn->inode) remain connected during the rename.
7343 *
7344 * to do this, we freeze srci, then master (destdn auth) verifies that
7345 * all other nodes have also replicated destdn and straydn. note that
7346 * destdn replicas need not also replicate srci. this only works when
7347 * destdn is master.
7348 *
7349 * This function takes responsibility for the passed mdr.
7350 */
7351 void Server::handle_client_rename(MDRequestRef& mdr)
7352 {
7353 const MClientRequest::const_ref &req = mdr->client_request;
7354 dout(7) << "handle_client_rename " << *req << dendl;
7355
7356 filepath destpath = req->get_filepath();
7357 filepath srcpath = req->get_filepath2();
7358 if (destpath.depth() == 0 || srcpath.depth() == 0) {
7359 respond_to_request(mdr, -EINVAL);
7360 return;
7361 }
7362 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
7363 respond_to_request(mdr, -EBUSY);
7364 return;
7365 }
7366
7367 std::string_view destname = destpath.last_dentry();
7368
7369 vector<CDentry*>& srctrace = mdr->dn[1];
7370 vector<CDentry*>& desttrace = mdr->dn[0];
7371
7372 MutationImpl::LockOpVec lov;
7373
7374 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, lov, true, false, true);
7375 if (!destdn) return;
7376 dout(10) << " destdn " << *destdn << dendl;
7377 if (mdr->snapid != CEPH_NOSNAP) {
7378 respond_to_request(mdr, -EROFS);
7379 return;
7380 }
7381 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7382 CDir *destdir = destdn->get_dir();
7383 ceph_assert(destdir->is_auth());
7384
7385 CF_MDS_MDRContextFactory cf(mdcache, mdr);
7386 int r = mdcache->path_traverse(mdr, cf, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
7387 if (r > 0)
7388 return; // delayed
7389 if (r < 0) {
7390 if (r == -ESTALE) {
7391 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
7392 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
7393 } else {
7394 dout(10) << "FAIL on error " << r << dendl;
7395 respond_to_request(mdr, r);
7396 }
7397 return;
7398
7399 }
7400 ceph_assert(!srctrace.empty());
7401 CDentry *srcdn = srctrace.back();
7402 dout(10) << " srcdn " << *srcdn << dendl;
7403 if (srcdn->last != CEPH_NOSNAP) {
7404 respond_to_request(mdr, -EROFS);
7405 return;
7406 }
7407 CDir *srcdir = srcdn->get_dir();
7408 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7409 CInode *srci = srcdnl->get_inode();
7410 dout(10) << " srci " << *srci << dendl;
7411
7412 CInode *oldin = 0;
7413 if (!destdnl->is_null()) {
7414 //dout(10) << "dest dn exists " << *destdn << dendl;
7415 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
7416 if (!oldin) return;
7417 dout(10) << " oldin " << *oldin << dendl;
7418
7419 // non-empty dir? do a trivial fast unlocked check now; another check happens later with read locks
7420 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
7421 respond_to_request(mdr, -ENOTEMPTY);
7422 return;
7423 }
7424
7425 // if srcdn is replica, need to make sure its linkage is correct
7426 if (srcdn->is_auth() ||
7427 srcdn->lock.can_read(mdr->get_client()) ||
7428 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
7429 // mv /some/thing /to/some/existing_other_thing
7430 if (oldin->is_dir() && !srci->is_dir()) {
7431 respond_to_request(mdr, -EISDIR);
7432 return;
7433 }
7434 if (!oldin->is_dir() && srci->is_dir()) {
7435 respond_to_request(mdr, -ENOTDIR);
7436 return;
7437 }
7438 if (srci == oldin && !srcdir->inode->is_stray()) {
7439 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
7440 return;
7441 }
7442 }
7443 }
7444
7445 // -- some sanity checks --
7446
7447 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7448 if (destpath.get_ino() != srcpath.get_ino() &&
7449 !(req->get_source().is_mds() &&
7450 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7451 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
7452 CInode *destbase = desttrace[0]->get_dir()->get_inode();
7453 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7454 while (srcbase != destbase &&
7455 !srcbase->is_projected_ancestor_of(destbase)) {
7456 CDentry *pdn = srcbase->get_projected_parent_dn();
7457 srctrace.insert(srctrace.begin(), pdn);
7458 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
7459 srcbase = pdn->get_dir()->get_inode();
7460 }
7461
7462 // then, extend destpath until it shares the same parent inode as srcpath.
7463 while (destbase != srcbase) {
7464 CDentry *pdn = destbase->get_projected_parent_dn();
7465 desttrace.insert(desttrace.begin(), pdn);
7466 lov.add_rdlock(&pdn->lock);
7467 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
7468 destbase = pdn->get_dir()->get_inode();
7469 }
7470 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
7471 }
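// Illustrative sketch (not from this file): the common-ancestor walk just
// above, in the abstract. parent(), prepend() and is_ancestor() are
// hypothetical stand-ins for the projected-parent-dentry operations used here.
#if 0
while (srcbase != destbase && !is_ancestor(srcbase, destbase))
  srcbase = prepend(srctrace, parent(srcbase));    // climb src toward root
while (destbase != srcbase)
  destbase = prepend(desttrace, parent(destbase)); // climb dest until they meet
#endif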
7472
7473 // src == dest?
7474 if (srcdir == destdir && srcdn->get_name() == destname) {
7475 dout(7) << "rename src=dest, noop" << dendl;
7476 respond_to_request(mdr, 0);
7477 return;
7478 }
7479
7480 // dest a child of src?
7481 // e.g. mv /usr /usr/foo
7482 CDentry *pdn = destdir->inode->get_projected_parent_dn();
7483 while (pdn) {
7484 if (pdn == srcdn) {
7485 dout(7) << "cannot rename item to be a child of itself" << dendl;
7486 respond_to_request(mdr, -EINVAL);
7487 return;
7488 }
7489 pdn = pdn->get_dir()->inode->parent;
7490 }
7491
7492 // is this a stray migration, reintegration or merge? (sanity checks!)
7493 if (mdr->reqid.name.is_mds() &&
7494 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
7495 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
7496 !(destdnl->is_remote() &&
7497 destdnl->get_remote_ino() == srci->ino())) {
7498 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
7499 return;
7500 }
7501
7502 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7503 if (linkmerge)
7504 dout(10) << " this is a link merge" << dendl;
7505
7506 // -- create stray dentry? --
7507 CDentry *straydn = NULL;
7508 if (destdnl->is_primary() && !linkmerge) {
7509 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
7510 if (!straydn)
7511 return;
7512 dout(10) << " straydn is " << *straydn << dendl;
7513 } else if (mdr->straydn) {
7514 mdr->unpin(mdr->straydn);
7515 mdr->straydn = NULL;
7516 }
7517
7518 // -- prepare witness list --
7519 /*
7520 * NOTE: we use _all_ replicas as witnesses.
7521 * this probably isn't totally necessary (esp for file renames),
7522 * but if/when we change that, we have to make sure rejoin is
7523 * sufficiently robust to handle strong rejoins from survivors
7524 * with totally wrong dentry->inode linkage.
7525 * (currently, it can ignore rename effects, because the resolve
7526 * stage will sort them out.)
7527 */
7528 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
7529 if (srcdn->is_auth())
7530 srcdn->list_replicas(witnesses);
7531 else
7532 witnesses.insert(srcdn->authority().first);
7533 if (srcdnl->is_remote() && !srci->is_auth())
7534 witnesses.insert(srci->authority().first);
7535 destdn->list_replicas(witnesses);
7536 if (destdnl->is_remote() && !oldin->is_auth())
7537 witnesses.insert(oldin->authority().first);
7538 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7539
7540
7541 // -- locks --
7542
7543 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
7544 for (int i=0; i<(int)srctrace.size(); i++)
7545 lov.add_rdlock(&srctrace[i]->lock);
7546 lov.add_xlock(&srcdn->lock);
7547 mds_rank_t srcdirauth = srcdir->authority().first;
7548 if (srcdirauth != mds->get_nodeid()) {
7549 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
7550 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdirauth);
7551 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdirauth);
7552 if (srci->is_dir())
7553 lov.add_rdlock(&srci->dirfragtreelock);
7554 } else {
7555 lov.add_wrlock(&srcdir->inode->filelock);
7556 lov.add_wrlock(&srcdir->inode->nestlock);
7557 }
7558 mds->locker->include_snap_rdlocks(srcdir->inode, lov);
7559
7560 // straydn?
7561 if (straydn) {
7562 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7563 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7564 lov.add_xlock(&straydn->lock);
7565 }
7566
7567 // xlock versionlock on dentries if there are witnesses.
7568 // replicas can't see projected dentry linkages, and will get
7569 // confused if we try to pipeline things.
7570 if (!witnesses.empty()) {
7571 // take xlock on all projected ancestor dentries for srcdn and destdn.
7572 // this ensures the srcdn and destdn can be traversed to by the witnesses.
7573 for (int i= 0; i<(int)srctrace.size(); i++) {
7574 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
7575 lov.add_xlock(&srctrace[i]->versionlock);
7576 }
7577 for (int i=0; i<(int)desttrace.size(); i++) {
7578 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
7579 lov.add_xlock(&desttrace[i]->versionlock);
7580 }
7581 // xlock srci and oldin's primary dentries, so witnesses can call
7582 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
7583 // is traversed.
7584 if (srcdnl->is_remote())
7585 lov.add_xlock(&srci->get_projected_parent_dn()->lock);
7586 if (destdnl->is_remote())
7587 lov.add_xlock(&oldin->get_projected_parent_dn()->lock);
7588 }
7589
7590 // we need to update srci's ctime. xlock its least contended lock to do that...
7591 lov.add_xlock(&srci->linklock);
7592 lov.add_xlock(&srci->snaplock);
7593
7594 if (oldin) {
7595 // xlock oldin (for nlink--)
7596 lov.add_xlock(&oldin->linklock);
7597 lov.add_xlock(&oldin->snaplock);
7598 if (oldin->is_dir())
7599 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7600 }
7601
7602 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
7603 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
7604 return;
7605
7606 if (linkmerge)
7607 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
7608
7609 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7610 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7611 return;
7612
7613 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
7614 return;
7615
7616 if (!check_fragment_space(mdr, destdn->get_dir()))
7617 return;
7618
7619 if (!check_access(mdr, srci, MAY_WRITE))
7620 return;
7621 }
7622
7623 // with read lock, really verify oldin is empty
7624 if (oldin &&
7625 oldin->is_dir() &&
7626 _dir_is_nonempty(mdr, oldin)) {
7627 respond_to_request(mdr, -ENOTEMPTY);
7628 return;
7629 }
7630
7631 /* project_snaprealm_past_parent() will do this job
7632 *
7633 // moving between snaprealms?
7634 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7635 SnapRealm *srcrealm = srci->find_snaprealm();
7636 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7637 if (srcrealm != destrealm &&
7638 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7639 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7640 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7641 mdcache->snaprealm_create(mdr, srci);
7642 return;
7643 }
7644 }
7645 */
7646
7647 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7648
7649 // -- open all srcdn inode frags, if any --
7650 // we need these open so that auth can properly delegate from inode to dirfrags
7651 // after the inode is _ours_.
7652 if (srcdnl->is_primary() &&
7653 !srcdn->is_auth() &&
7654 srci->is_dir()) {
7655 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
7656 mdr->set_stickydirs(srci);
7657
7658 frag_vec_t leaves;
7659 srci->dirfragtree.get_leaves(leaves);
7660 for (const auto& leaf : leaves) {
7661 CDir *dir = srci->get_dirfrag(leaf);
7662 if (!dir) {
7663 dout(10) << " opening " << leaf << " under " << *srci << dendl;
7664 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7665 return;
7666 }
7667 }
7668 }
7669
7670 // -- prepare snaprealm ---
7671
7672 if (linkmerge) {
7673 if (!mdr->more()->srci_srnode &&
7674 srci->get_projected_inode()->nlink == 1 &&
7675 srci->is_projected_snaprealm_global()) {
7676 sr_t *new_srnode = srci->prepare_new_srnode(0);
7677 srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);
7678
7679 srci->clear_snaprealm_global(new_srnode);
7680 mdr->more()->srci_srnode = new_srnode;
7681 }
7682 } else {
7683 if (oldin && !mdr->more()->desti_srnode) {
7684 if (oldin->is_projected_snaprealm_global()) {
7685 sr_t *new_srnode = oldin->prepare_new_srnode(0);
7686 oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
7687 // dropping the last linkage or dropping the last remote linkage,
7688 // detach the inode from global snaprealm
7689 auto nlink = oldin->get_projected_inode()->nlink;
7690 if (nlink == 1 ||
7691 (nlink == 2 && !destdnl->is_primary() &&
7692 !oldin->get_projected_parent_dir()->inode->is_stray()))
7693 oldin->clear_snaprealm_global(new_srnode);
7694 mdr->more()->desti_srnode = new_srnode;
7695 } else if (destdnl->is_primary()) {
7696 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7697 snapid_t follows = dest_realm->get_newest_seq();
7698 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
7699 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
7700 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7701 mdr->more()->desti_srnode = new_srnode;
7702 }
7703 }
7704 }
7705 if (!mdr->more()->srci_srnode) {
7706 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7707 if (srci->is_projected_snaprealm_global()) {
7708 sr_t *new_srnode = srci->prepare_new_srnode(0);
7709 srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
7710 mdr->more()->srci_srnode = new_srnode;
7711 } else if (srcdnl->is_primary()) {
7712 SnapRealm *src_realm = srcdir->inode->find_snaprealm();
7713 snapid_t follows = src_realm->get_newest_seq();
7714 if (src_realm != dest_realm &&
7715 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
7716 sr_t *new_srnode = srci->prepare_new_srnode(follows);
7717 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
7718 mdr->more()->srci_srnode = new_srnode;
7719 }
7720 }
7721 }
7722 }
7723
7724 // -- prepare witnesses --
7725
7726 // do srcdn auth last
7727 mds_rank_t last = MDS_RANK_NONE;
7728 if (!srcdn->is_auth()) {
7729 last = srcdn->authority().first;
7730 mdr->more()->srcdn_auth_mds = last;
7731 // ask auth of srci to mark srci as ambiguous auth if more than two MDSs
7732 // are involved in the rename operation.
7733 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
7734 dout(10) << " preparing ambiguous auth for srci" << dendl;
7735 ceph_assert(mdr->more()->is_remote_frozen_authpin);
7736 ceph_assert(mdr->more()->rename_inode == srci);
7737 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7738 return;
7739 }
7740 }
7741
7742 for (set<mds_rank_t>::iterator p = witnesses.begin();
7743 p != witnesses.end();
7744 ++p) {
7745 if (*p == last) continue; // do it last!
7746 if (mdr->more()->witnessed.count(*p)) {
7747 dout(10) << " already witnessed by mds." << *p << dendl;
7748 } else if (mdr->more()->waiting_on_slave.count(*p)) {
7749 dout(10) << " already waiting on witness mds." << *p << dendl;
7750 } else {
7751 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
7752 return;
7753 }
7754 }
7755 if (!mdr->more()->waiting_on_slave.empty())
7756 return; // we're waiting for a witness.
7757
7758 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
7759 dout(10) << " preparing last witness (srcdn auth)" << dendl;
7760 ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
7761 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7762 return;
7763 }
7764
7765 // test hack: bail after the slave does prepare, so we can verify a _live_ rollback.
7766 if (!mdr->more()->slaves.empty() && !srci->is_dir())
7767 ceph_assert(g_conf()->mds_kill_rename_at != 3);
7768 if (!mdr->more()->slaves.empty() && srci->is_dir())
7769 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7770
7771 // -- declare now --
7772 mdr->set_mds_stamp(ceph_clock_now());
7773
7774 // -- prepare journal entry --
7775 mdr->ls = mdlog->get_current_segment();
7776 EUpdate *le = new EUpdate(mdlog, "rename");
7777 mdlog->start_entry(le);
7778 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7779 if (!mdr->more()->witnessed.empty()) {
7780 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7781
7782 le->reqid = mdr->reqid;
7783 le->had_slaves = true;
7784
7785 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7786 // no need to send frozen auth pin to the recovering auth MDS of srci
7787 mdr->more()->is_remote_frozen_authpin = false;
7788 }
7789
7790 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
7791 if (le->client_map.length())
7792 le->cmapv = mds->sessionmap.get_projected();
7793
7794 // -- commit locally --
7795 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
7796
7797 journal_and_reply(mdr, srci, destdn, le, fin);
7798 mds->balancer->maybe_fragment(destdn->get_dir(), false);
7799 }
7800
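// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the witness preparation order
// enforced in handle_client_rename() above. All other witnesses are prepared
// first; the srcdn auth is always prepared last, once everyone else has
// acked. prepare_witness() is a hypothetical stand-in.
#if 0
for (auto rank : witnesses) {
  if (rank == srcdn_auth)
    continue;                       // do it last!
  if (!witnessed.count(rank) && !waiting.count(rank))
    prepare_witness(rank);
}
if (waiting.empty() &&
    srcdn_auth != MDS_RANK_NONE && !witnessed.count(srcdn_auth))
  prepare_witness(srcdn_auth);      // srcdn auth verifies the witness list
#endif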
7801
7802 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7803 {
7804 dout(10) << "_rename_finish " << *mdr << dendl;
7805
7806 if (!mdr->more()->witnessed.empty())
7807 mdcache->logged_master_update(mdr->reqid);
7808
7809 // apply
7810 _rename_apply(mdr, srcdn, destdn, straydn);
7811
7812 mdcache->send_dentry_link(destdn, mdr);
7813
7814 CDentry::linkage_t *destdnl = destdn->get_linkage();
7815 CInode *in = destdnl->get_inode();
7816 bool need_eval = mdr->more()->cap_imports.count(in);
7817
7818 // test hack: test slave commit
7819 if (!mdr->more()->slaves.empty() && !in->is_dir())
7820 ceph_assert(g_conf()->mds_kill_rename_at != 5);
7821 if (!mdr->more()->slaves.empty() && in->is_dir())
7822 ceph_assert(g_conf()->mds_kill_rename_at != 6);
7823
7824 // bump popularity
7825 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7826 if (destdnl->is_remote() && in->is_auth())
7827 mds->balancer->hit_inode(in, META_POP_IWR);
7828
7829 // did we import srci? if so, explicitly ack that import before we unlock and reply.
7830
7831 ceph_assert(g_conf()->mds_kill_rename_at != 7);
7832
7833 // reply
7834 respond_to_request(mdr, 0);
7835
7836 if (need_eval)
7837 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
7838
7839 // clean up?
7840 // respond_to_request() drops locks. So stray reintegration can race with us.
7841 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7842 mdcache->notify_stray(straydn);
7843 }
7844 }
7845
7846
7847
7848 // helpers
7849
7850 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
7851 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
7852 {
7853 if (mds->is_cluster_degraded() &&
7854 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7855 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
7856 if (mdr->more()->waiting_on_slave.empty())
7857 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7858 return false;
7859 }
7860
7861 dout(10) << "_rename_prepare_witness mds." << who << dendl;
7862 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
7863
7864 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
7865 for (auto dn : srctrace)
7866 req->srcdnpath.push_dentry(dn->get_name());
7867 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
7868 for (auto dn : dsttrace)
7869 req->destdnpath.push_dentry(dn->get_name());
7870 if (straydn)
7871 mdcache->replicate_stray(straydn, who, req->straybl);
7872
7873 if (mdr->more()->srci_srnode)
7874 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
7875 if (mdr->more()->desti_srnode)
7876 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7877
7878 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7879
7880 // srcdn auth will verify our current witness list is sufficient
7881 req->witnesses = witnesses;
7882
7883 req->op_stamp = mdr->get_op_stamp();
7884 mds->send_message_mds(req, who);
7885
7886 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
7887 mdr->more()->waiting_on_slave.insert(who);
7888 return true;
7889 }
7890
7891 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
7892 {
7893 version_t oldpv = mdr->more()->inode_import_v;
7894
7895 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7896
7897 /* import node */
7898 auto blp = mdr->more()->inode_import.cbegin();
7899
7900 // imported caps
7901 map<client_t,entity_inst_t> client_map;
7902 map<client_t, client_metadata_t> client_metadata_map;
7903 decode(client_map, blp);
7904 decode(client_metadata_map, blp);
7905 prepare_force_open_sessions(client_map, client_metadata_map,
7906 mdr->more()->imported_session_map);
7907 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
7908 encode(client_metadata_map, *client_map_bl);
7909
7910 list<ScatterLock*> updated_scatterlocks;
7911 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
7912 mdr->more()->cap_imports, updated_scatterlocks);
7913
7914 // hack: force back to !auth and clean, temporarily
7915 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
7916 srcdnl->get_inode()->mark_clean();
7917
7918 return oldpv;
7919 }
7920
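// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): the decode order in
// _rename_prepare_import() above must consume the stream field-for-field in
// the order the exporting MDS encoded it. Schematically:
#if 0
auto blp = inode_import.cbegin();
decode(client_map, blp);                // 1st: which clients hold caps
decode(client_metadata_map, blp);       // 2nd: their session metadata
decode_import_inode(srcdn, blp, ...);   // 3rd: the inode payload itself
#endif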
7921 bool Server::_need_force_journal(CInode *diri, bool empty)
7922 {
7923 std::vector<CDir*> dirs;
7924 diri->get_dirfrags(dirs);
7925
7926 bool force_journal = false;
7927 if (empty) {
7928 for (const auto& dir : dirs) {
7929 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
7930 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7931 force_journal = true;
7932 break;
7933 } else
7934 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
7935 }
7936 } else {
7937 // see if any children of our frags are auth subtrees.
7938 std::vector<CDir*> subtrees;
7939 mdcache->get_subtrees(subtrees);
7940 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
7941 for (const auto& dir : dirs) {
7942 for (const auto& subtree : subtrees) {
7943 if (dir->contains(subtree)) {
7944 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
7945 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
7946 << *subtree << dendl;
7947 force_journal = true;
7948 break;
7949 } else
7950 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
7951 } else
7952 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
7953 }
7954 if (force_journal)
7955 break;
7956 }
7957 }
7958 return force_journal;
7959 }
7960
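// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): _need_force_journal() answers one
// question in two shapes. For a dir being emptied it asks "is any of my own
// frags an auth subtree root?"; otherwise "does any of my frags contain an
// auth subtree?". A single hit forces journaling. The predicates below are
// hypothetical condensations of the loops above.
#if 0
bool force = false;
for (const auto &dir : dirs) {
  if (empty ? is_auth_subtree_root(dir) : contains_auth_subtree(dir)) {
    force = true;
    break;
  }
}
#endif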
7961 void Server::_rename_prepare(MDRequestRef& mdr,
7962 EMetaBlob *metablob, bufferlist *client_map_bl,
7963 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7964 {
7965 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7966 if (straydn)
7967 dout(10) << " straydn " << *straydn << dendl;
7968
7969 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7970 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7971 CInode *srci = srcdnl->get_inode();
7972 CInode *oldin = destdnl->get_inode();
7973
7974 // primary+remote link merge?
7975 bool linkmerge = (srci == oldin);
7976 if (linkmerge)
7977 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7978 bool silent = srcdn->get_dir()->inode->is_stray();
7979
7980 bool force_journal_dest = false;
7981 if (srci->is_dir() && !destdn->is_auth()) {
7982 if (srci->is_auth()) {
7983 // if we are auth for srci and exporting it, force journal because journal replay needs
7984 // the source inode to create auth subtrees.
7985 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7986 force_journal_dest = true;
7987 } else
7988 force_journal_dest = _need_force_journal(srci, false);
7989 }
7990
7991 bool force_journal_stray = false;
7992 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7993 force_journal_stray = _need_force_journal(oldin, true);
7994
7995 if (linkmerge)
7996 dout(10) << " merging remote and primary links to the same inode" << dendl;
7997 if (silent)
7998 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7999 if (force_journal_dest)
8000 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8001 if (force_journal_stray)
8002 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8003
8004 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8005 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8006 metablob->renamed_dirino = srci->ino();
8007 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8008 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8009 metablob->renamed_dirino = oldin->ino();
8010 }
8011
8012 // prepare
8013 CInode::mempool_inode *spi = 0; // renamed inode
8014 CInode::mempool_inode *tpi = 0; // target/overwritten inode
8015
8016 // target inode
8017 if (!linkmerge) {
8018 if (destdnl->is_primary()) {
8019 ceph_assert(straydn); // moving to straydn.
8020 // link--, and move.
8021 if (destdn->is_auth()) {
8022 auto &pi= oldin->project_inode(); //project_snaprealm
8023 pi.inode.version = straydn->pre_dirty(pi.inode.version);
8024 pi.inode.update_backtrace();
8025 tpi = &pi.inode;
8026 }
8027 straydn->push_projected_linkage(oldin);
8028 } else if (destdnl->is_remote()) {
8029 // nlink-- targeti
8030 if (oldin->is_auth()) {
8031 auto &pi = oldin->project_inode();
8032 pi.inode.version = oldin->pre_dirty();
8033 tpi = &pi.inode;
8034 }
8035 }
8036 }
8037
8038 // dest
8039 if (srcdnl->is_remote()) {
8040 if (!linkmerge) {
8041 // destdn
8042 if (destdn->is_auth())
8043 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8044 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8045 // srci
8046 if (srci->is_auth()) {
8047 auto &pi = srci->project_inode();
8048 pi.inode.version = srci->pre_dirty();
8049 spi = &pi.inode;
8050 }
8051 } else {
8052 dout(10) << " will merge remote onto primary link" << dendl;
8053 if (destdn->is_auth()) {
8054 auto &pi = oldin->project_inode();
8055 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
8056 spi = &pi.inode;
8057 }
8058 }
8059 } else { // primary
8060 if (destdn->is_auth()) {
8061 version_t oldpv;
8062 if (srcdn->is_auth())
8063 oldpv = srci->get_projected_version();
8064 else {
8065 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8066
8067 // note which dirfrags have child subtrees in the journal
8068 // event, so that we can open those (as bounds) during replay.
8069 if (srci->is_dir()) {
8070 list<CDir*> ls;
8071 srci->get_dirfrags(ls);
8072 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8073 CDir *dir = *p;
8074 if (!dir->is_auth())
8075 metablob->renamed_dir_frags.push_back(dir->get_frag());
8076 }
8077 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8078 }
8079 }
8080 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
8081 // & srcdnl->snaprealm
8082 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8083 pi.inode.update_backtrace();
8084 spi = &pi.inode;
8085 }
8086 destdn->push_projected_linkage(srci);
8087 }
8088
8089 // src
8090 if (srcdn->is_auth())
8091 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8092 srcdn->push_projected_linkage(); // push null linkage
8093
8094 if (!silent) {
8095 if (spi) {
8096 spi->ctime = mdr->get_op_stamp();
8097 if (mdr->get_op_stamp() > spi->rstat.rctime)
8098 spi->rstat.rctime = mdr->get_op_stamp();
8099 spi->change_attr++;
8100 if (linkmerge)
8101 spi->nlink--;
8102 }
8103 if (tpi) {
8104 tpi->ctime = mdr->get_op_stamp();
8105 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8106 tpi->rstat.rctime = mdr->get_op_stamp();
8107 tpi->change_attr++;
8108 {
8109 std::string t;
8110 destdn->make_path_string(t, true);
8111 tpi->stray_prior_path = std::move(t);
8112 }
8113 tpi->nlink--;
8114 if (tpi->nlink == 0)
8115 oldin->state_set(CInode::STATE_ORPHAN);
8116 }
8117 }
8118
8119 // prepare nesting, mtime updates
8120 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8121
8122 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8123 // then link the source inode to destdn
8124 if (destdnl->is_primary()) {
8125 ceph_assert(straydn);
8126 if (straydn->is_auth()) {
8127 metablob->add_dir_context(straydn->get_dir());
8128 metablob->add_dir(straydn->get_dir(), true);
8129 }
8130 }
8131
8132 // sub off target
8133 if (destdn->is_auth() && !destdnl->is_null()) {
8134 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8135 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8136 if (destdnl->is_primary()) {
8137 ceph_assert(straydn);
8138 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8139 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8140 }
8141 }
8142
8143 // move srcdn
8144 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8145 int flags = predirty_dir | predirty_primary;
8146 if (srcdn->is_auth())
8147 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8148 if (destdn->is_auth())
8149 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8150
8151 // add it all to the metablob
8152 // target inode
8153 if (!linkmerge) {
8154 if (destdnl->is_primary()) {
8155 ceph_assert(straydn);
8156 if (destdn->is_auth()) {
8157 // project snaprealm, too
8158 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8159 oldin->project_snaprealm(desti_srnode);
8160 if (tpi->nlink == 0)
8161 ceph_assert(!desti_srnode->is_parent_global());
8162 desti_srnode = NULL;
8163 }
8164 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8165 metablob->add_primary_dentry(straydn, oldin, true, true);
8166 } else if (force_journal_stray) {
8167 dout(10) << " forced journaling straydn " << *straydn << dendl;
8168 metablob->add_dir_context(straydn->get_dir());
8169 metablob->add_primary_dentry(straydn, oldin, true);
8170 }
8171 } else if (destdnl->is_remote()) {
8172 if (oldin->is_auth()) {
8173 sr_t *new_srnode = NULL;
8174 if (mdr->slave_request) {
8175 if (mdr->slave_request->desti_snapbl.length() > 0) {
8176 new_srnode = new sr_t();
8177 auto p = mdr->slave_request->desti_snapbl.cbegin();
8178 decode(*new_srnode, p);
8179 }
8180 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8181 new_srnode = desti_srnode;
8182 desti_srnode = NULL;
8183 }
8184 if (new_srnode) {
8185 oldin->project_snaprealm(new_srnode);
8186 if (tpi->nlink == 0)
8187 ceph_assert(!new_srnode->is_parent_global());
8188 }
8189 // auth for targeti
8190 metablob->add_dir_context(oldin->get_projected_parent_dir());
8191 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
8192 CEPH_NOSNAP, 0, destdnl);
8193 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
8194 }
8195 }
8196 }
8197
8198 // dest
8199 if (srcdnl->is_remote()) {
8200 ceph_assert(!linkmerge);
8201 if (destdn->is_auth() && !destdnl->is_null())
8202 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8203 else
8204 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8205
8206 if (destdn->is_auth())
8207 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8208
8209 if (srci->is_auth()) { // it's remote
8210 if (mdr->slave_request) {
8211 if (mdr->slave_request->srci_snapbl.length() > 0) {
8212 sr_t *new_srnode = new sr_t();
8213 auto p = mdr->slave_request->srci_snapbl.cbegin();
8214 decode(*new_srnode, p);
8215 srci->project_snaprealm(new_srnode);
8216 }
8217 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8218 srci->project_snaprealm(srci_srnode);
8219 srci_srnode = NULL;
8220 }
8221
8222 CDentry *srci_pdn = srci->get_projected_parent_dn();
8223 metablob->add_dir_context(srci_pdn->get_dir());
8224 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
8225 metablob->add_primary_dentry(srci_pdn, srci, true);
8226 }
8227 } else if (srcdnl->is_primary()) {
8228 // project snap parent update?
8229 if (destdn->is_auth()) {
8230 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8231 srci->project_snaprealm(srci_srnode);
8232 srci_srnode = NULL;
8233 }
8234 }
8235
8236 if (destdn->is_auth() && !destdnl->is_null())
8237 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8238
8239 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8240
8241 if (destdn->is_auth())
8242 metablob->add_primary_dentry(destdn, srci, true, true);
8243 else if (force_journal_dest) {
8244 dout(10) << " forced journaling destdn " << *destdn << dendl;
8245 metablob->add_dir_context(destdn->get_dir());
8246 metablob->add_primary_dentry(destdn, srci, true);
8247 if (srcdn->is_auth() && srci->is_dir()) {
8248 // journal new subtrees root dirfrags
8249 list<CDir*> ls;
8250 srci->get_dirfrags(ls);
8251 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8252 CDir *dir = *p;
8253 if (dir->is_auth())
8254 metablob->add_dir(dir, true);
8255 }
8256 }
8257 }
8258 }
8259
8260 // src
8261 if (srcdn->is_auth()) {
8262 dout(10) << " journaling srcdn " << *srcdn << dendl;
8263 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8264 // also journal the inode in case we need to do slave rename rollback. It is OK to add
8265 // both primary and null dentries, because during journal replay the null dentry is
8266 // processed after the primary dentry.
8267 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8268 metablob->add_primary_dentry(srcdn, srci, true);
8269 metablob->add_null_dentry(srcdn, true);
8270 } else
8271 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8272
8273 // make renamed inode first track the dn
8274 if (srcdnl->is_primary() && destdn->is_auth()) {
8275 ceph_assert(srci->first <= destdn->first);
8276 srci->first = destdn->first;
8277 }
8278 // make stray inode first track the straydn
8279 if (straydn && straydn->is_auth()) {
8280 ceph_assert(oldin->first <= straydn->first);
8281 oldin->first = straydn->first;
8282 }
8283
8284 if (oldin && oldin->is_dir()) {
8285 ceph_assert(straydn);
8286 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8287 }
8288 if (srci->is_dir())
8289 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8290
8291 }
8292
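// ---------------------------------------------------------------------------
// Illustrative sketch (not from this file): how the predirty flags compose in
// _rename_prepare() above. A stray reintegration ("silent") suppresses dir
// mtime/nlink updates, and PREDIRTY_PRIMARY is only added when a primary link
// actually changes directories:
#if 0
int predirty_dir     = silent ? 0 : PREDIRTY_DIR;
int predirty_primary = (srcdnl->is_primary() &&
                        srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY : 0;
int flags = predirty_dir | predirty_primary;  // fed to predirty_journal_parents()
#endif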
8293
8294 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8295 {
8296 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8297 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8298
8299 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8300 CDentry::linkage_t *destdnl = destdn->get_linkage();
8301
8302 CInode *oldin = destdnl->get_inode();
8303
8304 // primary+remote link merge?
8305 bool linkmerge = (srcdnl->get_inode() == oldin);
8306 if (linkmerge)
8307 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8308
8309 bool new_in_snaprealm = false;
8310 bool new_oldin_snaprealm = false;
8311
8312 // target inode
8313 if (!linkmerge) {
8314 if (destdnl->is_primary()) {
8315 ceph_assert(straydn);
8316 dout(10) << "straydn is " << *straydn << dendl;
8317
8318 // if there is newly created snaprealm, need to split old snaprealm's
8319 // inodes_with_caps. So pop snaprealm before linkage changes.
8320 if (destdn->is_auth()) {
8321 bool hadrealm = (oldin->snaprealm ? true : false);
8322 oldin->early_pop_projected_snaprealm();
8323 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8324 } else {
8325 ceph_assert(mdr->slave_request);
8326 if (mdr->slave_request->desti_snapbl.length()) {
8327 new_oldin_snaprealm = !oldin->snaprealm;
8328 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8329 ceph_assert(oldin->snaprealm);
8330 ceph_assert(oldin->snaprealm->have_past_parents_open());
8331 }
8332 }
8333
8334 destdn->get_dir()->unlink_inode(destdn, false);
8335
8336 straydn->pop_projected_linkage();
8337 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8338 ceph_assert(!straydn->is_projected()); // no other projected
8339
8340 // nlink-- targeti
8341 if (destdn->is_auth())
8342 oldin->pop_and_dirty_projected_inode(mdr->ls);
8343
8344 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
8345 } else if (destdnl->is_remote()) {
8346 destdn->get_dir()->unlink_inode(destdn, false);
8347 if (oldin->is_auth()) {
8348 oldin->pop_and_dirty_projected_inode(mdr->ls);
8349 } else if (mdr->slave_request) {
8350 if (mdr->slave_request->desti_snapbl.length() > 0) {
8351 ceph_assert(oldin->snaprealm);
8352 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8353 }
8354 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8355 delete desti_srnode;
8356 desti_srnode = NULL;
8357 }
8358 }
8359 }
8360
8361 // unlink src before we relink it at dest
8362 CInode *in = srcdnl->get_inode();
8363 ceph_assert(in);
8364
8365 bool srcdn_was_remote = srcdnl->is_remote();
8366 if (!srcdn_was_remote) {
8367 // if there is newly created snaprealm, need to split old snaprealm's
8368 // inodes_with_caps. So pop snaprealm before linkage changes.
8369 if (destdn->is_auth()) {
8370 bool hadrealm = (in->snaprealm ? true : false);
8371 in->early_pop_projected_snaprealm();
8372 new_in_snaprealm = (in->snaprealm && !hadrealm);
8373 } else {
8374 ceph_assert(mdr->slave_request);
8375 if (mdr->slave_request->srci_snapbl.length()) {
8376 new_in_snaprealm = !in->snaprealm;
8377 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8378 ceph_assert(in->snaprealm);
8379 ceph_assert(in->snaprealm->have_past_parents_open());
8380 }
8381 }
8382 }
8383
8384 srcdn->get_dir()->unlink_inode(srcdn);
8385
8386 // dest
8387 if (srcdn_was_remote) {
8388 if (!linkmerge) {
8389 // destdn
8390 destdnl = destdn->pop_projected_linkage();
8391 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8392 ceph_assert(!destdn->is_projected()); // no other projected
8393
8394 destdn->link_remote(destdnl, in);
8395 if (destdn->is_auth())
8396 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8397 // in
8398 if (in->is_auth()) {
8399 in->pop_and_dirty_projected_inode(mdr->ls);
8400 } else if (mdr->slave_request) {
8401 if (mdr->slave_request->srci_snapbl.length() > 0) {
8402 ceph_assert(in->snaprealm);
8403 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8404 }
8405 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8406 delete srci_srnode;
8407 srci_srnode = NULL;
8408 }
8409 } else {
8410 dout(10) << "merging remote onto primary link" << dendl;
8411 oldin->pop_and_dirty_projected_inode(mdr->ls);
8412 }
8413 } else { // primary
8414 if (linkmerge) {
8415 dout(10) << "merging primary onto remote link" << dendl;
8416 destdn->get_dir()->unlink_inode(destdn, false);
8417 }
8418 destdnl = destdn->pop_projected_linkage();
8419 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8420 ceph_assert(!destdn->is_projected()); // no other projected
8421
8422 // srcdn inode import?
8423 if (!srcdn->is_auth() && destdn->is_auth()) {
8424 ceph_assert(mdr->more()->inode_import.length() > 0);
8425
8426 map<client_t,Capability::Import> imported_caps;
8427
8428 // finish cap imports
8429 finish_force_open_sessions(mdr->more()->imported_session_map);
8430 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
8431 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
8432 mdr->more()->srcdn_auth_mds, true,
8433 mdr->more()->imported_session_map,
8434 mdr->more()->cap_imports[destdnl->get_inode()],
8435 imported_caps);
8436 }
8437
8438 mdr->more()->inode_import.clear();
8439 encode(imported_caps, mdr->more()->inode_import);
8440
8441 /* hack: add an auth pin for each xlock we hold. These were
8442 * remote xlocks previously but now they're local and
8443 * we're going to try and unpin when we xlock_finish. */
8444
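// (mdr->locks is ordered so that an object's locks are contiguous, with the
// versionlock ordering first among the inode's locks; lower_bound() therefore
// lands on this inode's first lock and we can stop at the first lock with a
// different parent.)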
8445 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
8446 i != mdr->locks.end();
8447 ++i) {
8448 SimpleLock *lock = i->lock;
8449 if (lock->get_parent() != destdnl->get_inode())
8450 break;
8451 if (i->is_xlock() && !lock->is_locallock())
8452 mds->locker->xlock_import(lock);
8453 }
8454
8455 // hack: fix auth bit
8456 in->state_set(CInode::STATE_AUTH);
8457
8458 mdr->clear_ambiguous_auth();
8459 }
8460
8461 if (destdn->is_auth())
8462 in->pop_and_dirty_projected_inode(mdr->ls);
8463 }
8464
8465 // src
8466 if (srcdn->is_auth())
8467 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
8468 srcdn->pop_projected_linkage();
8469 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8470 ceph_assert(!srcdn->is_projected()); // no other projected
8471
8472 // apply remaining projected inodes (nested)
8473 mdr->apply();
8474
8475 // update subtree map?
8476 if (destdnl->is_primary() && in->is_dir())
8477 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
8478
8479 if (straydn && oldin->is_dir())
8480 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
8481
8482 if (new_oldin_snaprealm)
8483 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
8484 if (new_in_snaprealm)
8485 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
8486
8487 // removing a new dn?
8488 if (srcdn->is_auth())
8489 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
8490 }
8491
8492
8493
8494 // ------------
8495 // SLAVE
8496
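// Slave-side rename flow, roughly:
//  1. master sends MMDSSlaveRequest::OP_RENAMEPREP
//  2. handle_slave_rename_prep(): freeze + mark srci ambiguous auth if we
//     are srcdn auth, capture a rename_rollback blob, journal an
//     ESlaveUpdate OP_PREPARE
//  3. _logged_slave_rename(): apply the rename and ack with
//     OP_RENAMEPREPACK (carrying the exported inode if srci migrates)
//  4. master resolves: _commit_slave_rename() journals OP_COMMIT, or on
//     abort replays the rollback blob via do_rename_rollback()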
8497 class C_MDS_SlaveRenamePrep : public ServerLogContext {
8498 CDentry *srcdn, *destdn, *straydn;
8499 public:
8500 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8501 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
8502 void finish(int r) override {
8503 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
8504 }
8505 };
8506
8507 class C_MDS_SlaveRenameCommit : public ServerContext {
8508 MDRequestRef mdr;
8509 CDentry *srcdn, *destdn, *straydn;
8510 public:
8511 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8512 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
8513 void finish(int r) override {
8514 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
8515 }
8516 };
8517
8518 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
8519 MDRequestRef mdr;
8520 public:
8521 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
8522 ServerContext(s), mdr(r) {}
8523 void finish(int r) override {
8524 server->_slave_rename_sessions_flushed(mdr);
8525 }
8526 };
8527
8528 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
8529 {
8530 dout(10) << "handle_slave_rename_prep " << *mdr
8531 << " " << mdr->slave_request->srcdnpath
8532 << " to " << mdr->slave_request->destdnpath
8533 << dendl;
8534
8535 if (mdr->slave_request->is_interrupted()) {
8536 dout(10) << " slave request interrupted, sending noop reply" << dendl;
8537 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8538 reply->mark_interrupted();
8539 mds->send_message_mds(reply, mdr->slave_to_mds);
8540 mdr->reset_slave_request();
8541 return;
8542 }
8543
8544 // discover destdn
8545 filepath destpath(mdr->slave_request->destdnpath);
8546 dout(10) << " dest " << destpath << dendl;
8547 vector<CDentry*> trace;
8548 CF_MDS_MDRContextFactory cf(mdcache, mdr);
8549 int r = mdcache->path_traverse(mdr, cf, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
8550 if (r > 0) return;
8551 if (r == -ESTALE) {
8552 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8553 mdr->slave_to_mds);
8554 return;
8555 }
8556 ceph_assert(r == 0); // we shouldn't get an error here!
8557
8558 CDentry *destdn = trace.back();
8559 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8560 dout(10) << " destdn " << *destdn << dendl;
8561 mdr->pin(destdn);
8562
8563 // discover srcdn
8564 filepath srcpath(mdr->slave_request->srcdnpath);
8565 dout(10) << " src " << srcpath << dendl;
8566 CInode *srci = nullptr;
8567 r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
8568 if (r > 0) return;
8569 ceph_assert(r == 0);
8570
8571 // srcpath must not point to a null dentry
8572 ceph_assert(srci != nullptr);
8573
8574 CDentry *srcdn = trace.back();
8575 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8576 dout(10) << " srcdn " << *srcdn << dendl;
8577 mdr->pin(srcdn);
8578 mdr->pin(srci);
8579
8580 // stray?
8581 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8582 if (linkmerge)
8583 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8584 CDentry *straydn = mdr->straydn;
8585 if (destdnl->is_primary() && !linkmerge)
8586 ceph_assert(straydn);
8587
8588 mdr->set_op_stamp(mdr->slave_request->op_stamp);
8589 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
8590
8591 // set up commit waiter (early, to clean up any freezing etc we do)
8592 if (!mdr->more()->slave_commit)
8593 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
8594
8595 // am i srcdn auth?
8596 if (srcdn->is_auth()) {
8597 set<mds_rank_t> srcdnrep;
8598 srcdn->list_replicas(srcdnrep);
8599
8600 bool reply_witness = false;
8601 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
8602 // freeze?
8603 // we need this to
8604 // - avoid conflicting lock state changes
8605 // - avoid concurrent updates to the inode
8606 // (this could also be accomplished with the versionlock)
8607 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8608 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
8609 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
8610
8611 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8612 if (srcdnl->get_inode()->is_frozen_auth_pin())
8613 mdr->unfreeze_auth_pin();
8614
8615 if (!frozen_inode) {
8616 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
8617 return;
8618 }
8619
8620 /*
8621 * set ambiguous auth for srci
8622 * NOTE: we don't worry about ambiguous cache expire as we do
8623 * with subtree migrations because all slaves will pin
8624 * srcdn->get_inode() for duration of this rename.
8625 */
8626 mdr->set_ambiguous_auth(srcdnl->get_inode());
8627
8628 // just mark the source inode as ambiguous auth if more than two MDSs are involved.
8629 // the master will send another OP_RENAMEPREP slave request later.
8630 if (mdr->slave_request->witnesses.size() > 1) {
8631 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
8632 reply_witness = true;
8633 }
8634
8635 // make sure bystanders have received all lock related messages
8636 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8637 if (*p == mdr->slave_to_mds ||
8638 (mds->is_cluster_degraded() &&
8639 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
8640 continue;
8641 auto notify = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
8642 mds->send_message_mds(notify, *p);
8643 mdr->more()->waiting_on_slave.insert(*p);
8644 }
8645
8646 // make sure clients have received all cap related messages
8647 set<client_t> export_client_set;
8648 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
8649
8650 MDSGatherBuilder gather(g_ceph_context);
8651 flush_client_sessions(export_client_set, gather);
8652 if (gather.has_subs()) {
8653 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
8654 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
8655 gather.activate();
8656 }
8657 }
8658
8659 // is witness list sufficient?
8660 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8661 if (*p == mdr->slave_to_mds ||
8662 mdr->slave_request->witnesses.count(*p)) continue;
8663 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
8664 reply_witness = true;
8665 break;
8666 }
8667
8668 if (reply_witness) {
8669 ceph_assert(!srcdnrep.empty());
8670 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8671 reply->witnesses.swap(srcdnrep);
8672 mds->send_message_mds(reply, mdr->slave_to_mds);
8673 mdr->reset_slave_request();
8674 return;
8675 }
8676 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
8677 if (!mdr->more()->waiting_on_slave.empty()) {
8678 dout(10) << " still waiting for rename notify acks from "
8679 << mdr->more()->waiting_on_slave << dendl;
8680 return;
8681 }
8682 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
8683 // set ambiguous auth for srci on witnesses
8684 mdr->set_ambiguous_auth(srcdnl->get_inode());
8685 }
8686
8687 // encode everything we'd need to roll this back... basically, just the original state.
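// (sketch of the captured state: for each of orig_src / orig_dest / stray
// we record the dirfrag, its pre-rename mtime/rctime and the dentry name;
// orig_src/orig_dest additionally record either the primary ino or the
// remote_ino + d_type; optional srci/desti snaprealm blobs and the op
// ctime round it out.)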
8688 rename_rollback rollback;
8689
8690 rollback.reqid = mdr->reqid;
8691
8692 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
8693 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8694 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
8695 rollback.orig_src.dname = srcdn->get_name();
8696 if (srcdnl->is_primary())
8697 rollback.orig_src.ino = srcdnl->get_inode()->ino();
8698 else {
8699 ceph_assert(srcdnl->is_remote());
8700 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
8701 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
8702 }
8703
8704 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
8705 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8706 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
8707 rollback.orig_dest.dname = destdn->get_name();
8708 if (destdnl->is_primary())
8709 rollback.orig_dest.ino = destdnl->get_inode()->ino();
8710 else if (destdnl->is_remote()) {
8711 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
8712 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
8713 }
8714
8715 if (straydn) {
8716 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
8717 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
8718 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
8719 rollback.stray.dname = straydn->get_name();
8720 }
8721 if (mdr->slave_request->desti_snapbl.length()) {
8722 CInode *oldin = destdnl->get_inode();
8723 if (oldin->snaprealm) {
8724 encode(true, rollback.desti_snapbl);
8725 oldin->encode_snap_blob(rollback.desti_snapbl);
8726 } else {
8727 encode(false, rollback.desti_snapbl);
8728 }
8729 }
8730 if (mdr->slave_request->srci_snapbl.length()) {
8731 if (srci->snaprealm) {
8732 encode(true, rollback.srci_snapbl);
8733 srci->encode_snap_blob(rollback.srci_snapbl);
8734 } else {
8735 encode(false, rollback.srci_snapbl);
8736 }
8737 }
8738 encode(rollback, mdr->more()->rollback_bl);
8739 // FIXME: rollback snaprealm
8740 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
8741
8742 // journal.
8743 mdr->ls = mdlog->get_current_segment();
8744 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
8745 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
8746 mdlog->start_entry(le);
8747 le->rollback = mdr->more()->rollback_bl;
8748
8749 bufferlist blah; // inode import data... obviously not used if we're the slave
8750 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
8751
8752 if (le->commit.empty()) {
8753 dout(10) << " empty metablob, skipping journal" << dendl;
8754 mdlog->cancel_entry(le);
8755 mdr->ls = NULL;
8756 _logged_slave_rename(mdr, srcdn, destdn, straydn);
8757 } else {
8758 mdr->more()->slave_update_journaled = true;
8759 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
8760 mdr, __func__);
8761 mdlog->flush();
8762 }
8763 }
8764
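// Runs once the OP_PREPARE event commits (or immediately when there was
// nothing to journal): export srci's inode state if we are its auth, apply
// the rename, and ack the master with OP_RENAMEPREPACK.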
8765 void Server::_logged_slave_rename(MDRequestRef& mdr,
8766 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8767 {
8768 dout(10) << "_logged_slave_rename " << *mdr << dendl;
8769
8770 // prepare ack
8771 MMDSSlaveRequest::ref reply;
8772 if (!mdr->aborted) {
8773 reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8774 if (!mdr->more()->slave_update_journaled)
8775 reply->mark_not_journaled();
8776 }
8777
8778 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8779 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
8780
8781 // export srci?
8782 if (srcdn->is_auth() && srcdnl->is_primary()) {
8783 // set export bounds for CInode::encode_export()
8784 if (reply) {
8785 list<CDir*> bounds;
8786 if (srcdnl->get_inode()->is_dir()) {
8787 srcdnl->get_inode()->get_dirfrags(bounds);
8788 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
8789 (*p)->state_set(CDir::STATE_EXPORTBOUND);
8790 }
8791
8792 map<client_t,entity_inst_t> exported_client_map;
8793 map<client_t, client_metadata_t> exported_client_metadata_map;
8794 bufferlist inodebl;
8795 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
8796 exported_client_map,
8797 exported_client_metadata_map);
8798
8799 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
8800 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
8801
8802 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
8803 encode(exported_client_metadata_map, reply->inode_export);
8804 reply->inode_export.claim_append(inodebl);
8805 reply->inode_export_v = srcdnl->get_inode()->inode.version;
8806 }
8807
8808 // remove mdr auth pin
8809 mdr->auth_unpin(srcdnl->get_inode());
8810 mdr->more()->is_inode_exporter = true;
8811
8812 if (srcdnl->get_inode()->is_dirty())
8813 srcdnl->get_inode()->mark_clean();
8814
8815 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
8816 }
8817
8818 // apply
8819 _rename_apply(mdr, srcdn, destdn, straydn);
8820
8821 CDentry::linkage_t *destdnl = destdn->get_linkage();
8822
8823 // bump popularity
8824 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8825 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
8826 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
8827
8828 // done.
8829 mdr->reset_slave_request();
8830 mdr->straydn = 0;
8831
8832 if (reply) {
8833 mds->send_message_mds(reply, mdr->slave_to_mds);
8834 } else {
8835 ceph_assert(mdr->aborted);
8836 dout(10) << " abort flag set, finishing" << dendl;
8837 mdcache->request_finish(mdr);
8838 }
8839 }
8840
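// Master resolution callback: r == 0 means commit (journal our own
// OP_COMMIT, or finish immediately if no prepare was journaled); r < 0
// means abort, so undo the prepared rename from the rollback blob.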
8841 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
8842 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8843 {
8844 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
8845
8846 CInode *in = destdn->get_linkage()->get_inode();
8847
8848 inodeno_t migrated_stray;
8849 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
8850 migrated_stray = in->ino();
8851
8852 MDSContext::vec finished;
8853 if (r == 0) {
8854 // unfreeze+singleauth inode
8855 // hmm, do i really need to delay this?
8856 if (mdr->more()->is_inode_exporter) {
8857 // drop our pins
8858 // we exported, clear out any xlocks that we moved to another MDS
8859
8860 for (auto i = mdr->locks.lower_bound(&in->versionlock);
8861 i != mdr->locks.end(); ) {
8862 SimpleLock *lock = i->lock;
8863 if (lock->get_parent() != in)
8864 break;
8865 // we only care about xlocks on the exported inode
8866 if (i->is_xlock() && !lock->is_locallock())
8867 mds->locker->xlock_export(i++, mdr.get());
8868 else
8869 ++i;
8870 }
8871
8872 map<client_t,Capability::Import> peer_imported;
8873 auto bp = mdr->more()->inode_import.cbegin();
8874 decode(peer_imported, bp);
8875
8876 dout(10) << " finishing inode export on " << *in << dendl;
8877 mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
8878 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
8879
8880 // unfreeze
8881 ceph_assert(in->is_frozen_inode());
8882 in->unfreeze_inode(finished);
8883 }
8884
8885 // singleauth
8886 if (mdr->more()->is_ambiguous_auth) {
8887 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8888 mdr->more()->is_ambiguous_auth = false;
8889 }
8890
8891 if (straydn && mdr->more()->slave_update_journaled) {
8892 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8893 if (strayin && !strayin->snaprealm)
8894 mdcache->clear_dirty_bits_for_stray(strayin);
8895 }
8896
8897 mds->queue_waiters(finished);
8898 mdr->cleanup();
8899
8900 if (mdr->more()->slave_update_journaled) {
8901 // write a commit to the journal
8902 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
8903 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
8904 ESlaveUpdate::RENAME);
8905 mdlog->start_entry(le);
8906 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
8907 mdlog->flush();
8908 } else {
8909 _committed_slave(mdr);
8910 }
8911 } else {
8912
8913 // abort
8914 // rollback_bl may be empty if we froze the inode but had to provide an expanded
8915 // witness list to the master, and it failed before we tried prep again.
8916 if (mdr->more()->rollback_bl.length()) {
8917 if (mdr->more()->is_inode_exporter) {
8918 dout(10) << " reversing inode export of " << *in << dendl;
8919 in->abort_export();
8920 }
8921 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
8922 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
8923 // rollback but preserve the slave request
8924 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
8925 mdr->more()->rollback_bl.clear();
8926 } else
8927 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
8928 } else {
8929 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
8930 // singleauth
8931 if (mdr->more()->is_ambiguous_auth) {
8932 if (srcdn->is_auth())
8933 mdr->more()->rename_inode->unfreeze_inode(finished);
8934
8935 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8936 mdr->more()->is_ambiguous_auth = false;
8937 }
8938 mds->queue_waiters(finished);
8939 mdcache->request_finish(mdr);
8940 }
8941 }
8942
8943 if (migrated_stray && mds->is_stopping())
8944 mdcache->shutdown_export_stray_finish(migrated_stray);
8945 }
8946
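// Rollback helper: re-apply a +/-1 link count to the dirfrag's fragstat
// and rstat, and restore the old mtime/rctime if the frag is untouched
// since the rename (its mtime still equals the rename's ctime).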
8947 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
8948 bool isdir, int linkunlink, nest_info_t &rstat)
8949 {
8950 fnode_t *pf;
8951 pf = dir->project_fnode();
8952 mut->add_projected_fnode(dir);
8953 pf->version = dir->pre_dirty();
8954
8955 if (isdir) {
8956 pf->fragstat.nsubdirs += linkunlink;
8957 } else {
8958 pf->fragstat.nfiles += linkunlink;
8959 }
8960 if (r.ino) {
8961 pf->rstat.rbytes += linkunlink * rstat.rbytes;
8962 pf->rstat.rfiles += linkunlink * rstat.rfiles;
8963 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
8964 pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
8965 }
8966 if (pf->fragstat.mtime == ctime) {
8967 pf->fragstat.mtime = r.dirfrag_old_mtime;
8968 if (pf->rstat.rctime == ctime)
8969 pf->rstat.rctime = r.dirfrag_old_rctime;
8970 }
8971 mut->add_updated_lock(&dir->get_inode()->filelock);
8972 mut->add_updated_lock(&dir->get_inode()->nestlock);
8973 }
8974
8975 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
8976 MutationRef mut;
8977 CDentry *srcdn;
8978 version_t srcdnpv;
8979 CDentry *destdn;
8980 CDentry *straydn;
8981 map<client_t,MClientSnap::ref> splits[2];
8982 bool finish_mdr;
8983 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
8984 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
8985 map<client_t,MClientSnap::ref> _splits[2], bool f) :
8986 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
8987 straydn(st), finish_mdr(f) {
8988 splits[0].swap(_splits[0]);
8989 splits[1].swap(_splits[1]);
8990 }
8991 void finish(int r) override {
8992 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
8993 destdn, straydn, splits, finish_mdr);
8994 }
8995 };
8996
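// Undo a prepared slave rename using the encoded rename_rollback blob.
// This runs on abort or during resolve after a failure, so it must cope
// with partially trimmed cache: srcdn, destdn, straydn, in and target may
// each be absent.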
8997 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
8998 bool finish_mdr)
8999 {
9000 rename_rollback rollback;
9001 auto p = rbl.cbegin();
9002 decode(rollback, p);
9003
9004 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9005 // need to finish this update before sending resolve to claim the subtree
9006 mdcache->add_rollback(rollback.reqid, master);
9007
9008 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9009 mut->ls = mds->mdlog->get_current_segment();
9010
9011 CDentry *srcdn = NULL;
9012 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9013 if (!srcdir)
9014 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9015 if (srcdir) {
9016 dout(10) << " srcdir " << *srcdir << dendl;
9017 srcdn = srcdir->lookup(rollback.orig_src.dname);
9018 if (srcdn) {
9019 dout(10) << " srcdn " << *srcdn << dendl;
9020 ceph_assert(srcdn->get_linkage()->is_null());
9021 } else
9022 dout(10) << " srcdn not found" << dendl;
9023 } else
9024 dout(10) << " srcdir not found" << dendl;
9025
9026 CDentry *destdn = NULL;
9027 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9028 if (!destdir)
9029 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9030 if (destdir) {
9031 dout(10) << " destdir " << *destdir << dendl;
9032 destdn = destdir->lookup(rollback.orig_dest.dname);
9033 if (destdn)
9034 dout(10) << " destdn " << *destdn << dendl;
9035 else
9036 dout(10) << " destdn not found" << dendl;
9037 } else
9038 dout(10) << " destdir not found" << dendl;
9039
9040 CInode *in = NULL;
9041 if (rollback.orig_src.ino) {
9042 in = mdcache->get_inode(rollback.orig_src.ino);
9043 if (in && in->is_dir())
9044 ceph_assert(srcdn && destdn);
9045 } else
9046 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9047
9048 CDir *straydir = NULL;
9049 CDentry *straydn = NULL;
9050 if (rollback.stray.dirfrag.ino) {
9051 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9052 if (straydir) {
9053 dout(10) << "straydir " << *straydir << dendl;
9054 straydn = straydir->lookup(rollback.stray.dname);
9055 if (straydn) {
9056 dout(10) << " straydn " << *straydn << dendl;
9057 ceph_assert(straydn->get_linkage()->is_primary());
9058 } else
9059 dout(10) << " straydn not found" << dendl;
9060 } else
9061 dout(10) << "straydir not found" << dendl;
9062 }
9063
9064 CInode *target = NULL;
9065 if (rollback.orig_dest.ino) {
9066 target = mdcache->get_inode(rollback.orig_dest.ino);
9067 if (target)
9068 ceph_assert(destdn && straydn);
9069 } else if (rollback.orig_dest.remote_ino)
9070 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9071
9072 // can't use is_auth() in the resolve stage
9073 mds_rank_t whoami = mds->get_nodeid();
9074 // slave
9075 ceph_assert(!destdn || destdn->authority().first != whoami);
9076 ceph_assert(!straydn || straydn->authority().first != whoami);
9077
9078 bool force_journal_src = false;
9079 bool force_journal_dest = false;
9080 if (in && in->is_dir() && srcdn->authority().first != whoami)
9081 force_journal_src = _need_force_journal(in, false);
9082 if (in && target && target->is_dir())
9083 force_journal_dest = _need_force_journal(in, true);
9084
9085 version_t srcdnpv = 0;
9086 // repair src
9087 if (srcdn) {
9088 if (srcdn->authority().first == whoami)
9089 srcdnpv = srcdn->pre_dirty();
9090 if (rollback.orig_src.ino) {
9091 ceph_assert(in);
9092 srcdn->push_projected_linkage(in);
9093 } else
9094 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9095 rollback.orig_src.remote_d_type);
9096 }
9097
9098 map<client_t,MClientSnap::ref> splits[2];
9099
9100 CInode::mempool_inode *pip = nullptr;
9101 if (in) {
9102 bool projected;
9103 if (in->get_projected_parent_dn()->authority().first == whoami) {
9104 auto &pi = in->project_inode();
9105 pip = &pi.inode;
9106 mut->add_projected_inode(in);
9107 pip->version = in->pre_dirty();
9108 projected = true;
9109 } else {
9110 pip = in->get_projected_inode();
9111 projected = false;
9112 }
9113 if (pip->ctime == rollback.ctime)
9114 pip->ctime = rollback.orig_src.old_ctime;
9115
9116 if (rollback.srci_snapbl.length() && in->snaprealm) {
9117 bool hadrealm;
9118 auto p = rollback.srci_snapbl.cbegin();
9119 decode(hadrealm, p);
9120 if (hadrealm) {
9121 if (projected && !mds->is_resolve()) {
9122 sr_t *new_srnode = new sr_t();
9123 decode(*new_srnode, p);
9124 in->project_snaprealm(new_srnode);
9125 } else
9126 decode(in->snaprealm->srnode, p);
9127 } else {
9128 SnapRealm *realm;
9129 if (rollback.orig_src.ino) {
9130 ceph_assert(srcdir);
9131 realm = srcdir->get_inode()->find_snaprealm();
9132 } else {
9133 realm = in->snaprealm->parent;
9134 }
9135 if (!mds->is_resolve())
9136 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9137 if (projected)
9138 in->project_snaprealm(NULL);
9139 else
9140 in->snaprealm->merge_to(realm);
9141 }
9142 }
9143 }
9144
9145 if (srcdn && srcdn->authority().first == whoami) {
9146 nest_info_t blah;
9147 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9148 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
9149 }
9150
9151 // repair dest
9152 if (destdn) {
9153 if (rollback.orig_dest.ino && target) {
9154 destdn->push_projected_linkage(target);
9155 } else if (rollback.orig_dest.remote_ino) {
9156 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9157 rollback.orig_dest.remote_d_type);
9158 } else {
9159 // the dentry will be trimmed soon, it's ok to have wrong linkage
9160 if (rollback.orig_dest.ino)
9161 ceph_assert(mds->is_resolve());
9162 destdn->push_projected_linkage();
9163 }
9164 }
9165
9166 if (straydn)
9167 straydn->push_projected_linkage();
9168
9169 if (target) {
9170 bool projected;
9171 CInode::mempool_inode *ti = nullptr;
9172 if (target->get_projected_parent_dn()->authority().first == whoami) {
9173 auto &pi = target->project_inode();
9174 ti = &pi.inode;
9175 mut->add_projected_inode(target);
9176 ti->version = target->pre_dirty();
9177 projected = true;
9178 } else {
9179 ti = target->get_projected_inode();
9180 projected = false;
9181 }
9182 if (ti->ctime == rollback.ctime)
9183 ti->ctime = rollback.orig_dest.old_ctime;
9184 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9185 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9186 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9187 else
9188 ceph_assert(rollback.orig_dest.remote_ino &&
9189 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9190 } else
9191 ti->nlink++;
9192
9193 if (rollback.desti_snapbl.length() && target->snaprealm) {
9194 bool hadrealm;
9195 auto p = rollback.desti_snapbl.cbegin();
9196 decode(hadrealm, p);
9197 if (hadrealm) {
9198 if (projected && !mds->is_resolve()) {
9199 sr_t *new_srnode = new sr_t();
9200 decode(*new_srnode, p);
9201 target->project_snaprealm(new_srnode);
9202 } else
9203 decode(target->snaprealm->srnode, p);
9204 } else {
9205 SnapRealm *realm;
9206 if (rollback.orig_dest.ino) {
9207 ceph_assert(destdir);
9208 realm = destdir->get_inode()->find_snaprealm();
9209 } else {
9210 realm = target->snaprealm->parent;
9211 }
9212 if (!mds->is_resolve())
9213 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9214 if (projected)
9215 target->project_snaprealm(NULL);
9216 else
9217 target->snaprealm->merge_to(realm);
9218 }
9219 }
9220 }
9221
9222 if (srcdn)
9223 dout(0) << " srcdn back to " << *srcdn << dendl;
9224 if (in)
9225 dout(0) << " srci back to " << *in << dendl;
9226 if (destdn)
9227 dout(0) << " destdn back to " << *destdn << dendl;
9228 if (target)
9229 dout(0) << " desti back to " << *target << dendl;
9230
9231 // journal it
9232 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
9233 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
9234 mdlog->start_entry(le);
9235
9236 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9237 le->commit.add_dir_context(srcdir);
9238 if (rollback.orig_src.ino)
9239 le->commit.add_primary_dentry(srcdn, 0, true);
9240 else
9241 le->commit.add_remote_dentry(srcdn, true);
9242 }
9243
9244 if (!rollback.orig_src.ino && // remote linkage
9245 in && in->authority().first == whoami) {
9246 le->commit.add_dir_context(in->get_projected_parent_dir());
9247 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9248 }
9249
9250 if (force_journal_dest) {
9251 ceph_assert(rollback.orig_dest.ino);
9252 le->commit.add_dir_context(destdir);
9253 le->commit.add_primary_dentry(destdn, 0, true);
9254 }
9255
9256 // slave: no need to journal straydn
9257
9258 if (target && target != in && target->authority().first == whoami) {
9259 ceph_assert(rollback.orig_dest.remote_ino);
9260 le->commit.add_dir_context(target->get_projected_parent_dir());
9261 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9262 }
9263
9264 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9265 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9266 le->commit.renamed_dirino = in->ino();
9267 if (srcdn->authority().first == whoami) {
9268 list<CDir*> ls;
9269 in->get_dirfrags(ls);
9270 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9271 CDir *dir = *p;
9272 if (!dir->is_auth())
9273 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9274 }
9275 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9276 }
9277 } else if (force_journal_dest) {
9278 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9279 le->commit.renamed_dirino = target->ino();
9280 }
9281
9282 if (target && target->is_dir()) {
9283 ceph_assert(destdn);
9284 mdcache->project_subtree_rename(target, straydir, destdir);
9285 }
9286
9287 if (in && in->is_dir()) {
9288 ceph_assert(srcdn);
9289 mdcache->project_subtree_rename(in, destdir, srcdir);
9290 }
9291
9292 if (mdr && !mdr->more()->slave_update_journaled) {
9293 ceph_assert(le->commit.empty());
9294 mdlog->cancel_entry(le);
9295 mut->ls = NULL;
9296 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
9297 } else {
9298 ceph_assert(!le->commit.empty());
9299 if (mdr)
9300 mdr->more()->slave_update_journaled = false;
9301 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9302 srcdn, srcdnpv, destdn, straydn,
9303 splits, finish_mdr);
9304 submit_mdlog_entry(le, fin, mdr, __func__);
9305 mdlog->flush();
9306 }
9307 }
9308
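// Final step of the rollback, after the OP_ROLLBACK event (if any)
// commits: pop the restored linkages, re-adjust renamed subtrees, and
// either send the queued MClientSnap updates or, in resolve, trim
// non-auth subtrees.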
9309 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
9310 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9311 map<client_t,MClientSnap::ref> splits[2], bool finish_mdr)
9312 {
9313 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9314
9315 if (straydn) {
9316 straydn->get_dir()->unlink_inode(straydn);
9317 straydn->pop_projected_linkage();
9318 }
9319 if (destdn) {
9320 destdn->get_dir()->unlink_inode(destdn);
9321 destdn->pop_projected_linkage();
9322 }
9323 if (srcdn) {
9324 srcdn->pop_projected_linkage();
9325 if (srcdn->authority().first == mds->get_nodeid()) {
9326 srcdn->mark_dirty(srcdnpv, mut->ls);
9327 if (srcdn->get_linkage()->is_primary())
9328 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9329 }
9330 }
9331
9332 mut->apply();
9333
9334 if (srcdn && srcdn->get_linkage()->is_primary()) {
9335 CInode *in = srcdn->get_linkage()->get_inode();
9336 if (in && in->is_dir()) {
9337 ceph_assert(destdn);
9338 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9339 }
9340 }
9341
9342 if (destdn) {
9343 CInode *oldin = destdn->get_linkage()->get_inode();
9344 // update subtree map?
9345 if (oldin && oldin->is_dir()) {
9346 ceph_assert(straydn);
9347 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9348 }
9349 }
9350
9351 if (mds->is_resolve()) {
9352 CDir *root = NULL;
9353 if (straydn)
9354 root = mdcache->get_subtree_root(straydn->get_dir());
9355 else if (destdn)
9356 root = mdcache->get_subtree_root(destdn->get_dir());
9357 if (root)
9358 mdcache->try_trim_non_auth_subtree(root);
9359 } else {
9360 mdcache->send_snaps(splits[1]);
9361 mdcache->send_snaps(splits[0]);
9362 }
9363
9364 if (mdr) {
9365 MDSContext::vec finished;
9366 if (mdr->more()->is_ambiguous_auth) {
9367 if (srcdn->is_auth())
9368 mdr->more()->rename_inode->unfreeze_inode(finished);
9369
9370 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9371 mdr->more()->is_ambiguous_auth = false;
9372 }
9373 mds->queue_waiters(finished);
9374 if (finish_mdr || mdr->aborted)
9375 mdcache->request_finish(mdr);
9376 else
9377 mdr->more()->slave_rolling_back = false;
9378 }
9379
9380 mdcache->finish_rollback(mut->reqid);
9381
9382 mut->cleanup();
9383 }
9384
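// Master side: handle a slave's OP_RENAMEPREPACK. An empty witness list
// means the slave is now a committed witness; a non-empty list means we
// must restart with extra witnesses (the srcdn replicas). The ack may
// also carry the exported srci inode.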
9385 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
9386 {
9387 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9388 << " witnessed by " << ack->get_source()
9389 << " " << *ack << dendl;
9390 mds_rank_t from = mds_rank_t(ack->get_source().num());
9391
9392 // note slave
9393 mdr->more()->slaves.insert(from);
9394 if (mdr->more()->srcdn_auth_mds == from &&
9395 mdr->more()->is_remote_frozen_authpin &&
9396 !mdr->more()->is_ambiguous_auth) {
9397 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
9398 }
9399
9400 // witnessed? or add extra witnesses?
9401 ceph_assert(mdr->more()->witnessed.count(from) == 0);
9402 if (ack->is_interrupted()) {
9403 dout(10) << " slave request interrupted, noop" << dendl;
9404 } else if (ack->witnesses.empty()) {
9405 mdr->more()->witnessed.insert(from);
9406 if (!ack->is_not_journaled())
9407 mdr->more()->has_journaled_slaves = true;
9408 } else {
9409 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
9410 mdr->more()->extra_witnesses = ack->witnesses;
9411 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
9412 }
9413
9414 // srci import?
9415 if (ack->inode_export.length()) {
9416 dout(10) << " got srci import" << dendl;
9417 mdr->more()->inode_import.share(ack->inode_export);
9418 mdr->more()->inode_import_v = ack->inode_export_v;
9419 }
9420
9421 // remove from waiting list
9422 ceph_assert(mdr->more()->waiting_on_slave.count(from));
9423 mdr->more()->waiting_on_slave.erase(from);
9424
9425 if (mdr->more()->waiting_on_slave.empty())
9426 dispatch_client_request(mdr); // go again!
9427 else
9428 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
9429 }
9430
9431 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
9432 {
9433 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
9434 << ack->get_source() << dendl;
9435 ceph_assert(mdr->is_slave());
9436 mds_rank_t from = mds_rank_t(ack->get_source().num());
9437
9438 if (mdr->more()->waiting_on_slave.count(from)) {
9439 mdr->more()->waiting_on_slave.erase(from);
9440
9441 if (mdr->more()->waiting_on_slave.empty()) {
9442 if (mdr->slave_request)
9443 dispatch_slave_request(mdr);
9444 } else
9445 dout(10) << " still waiting for rename notify acks from "
9446 << mdr->more()->waiting_on_slave << dendl;
9447 }
9448 }
9449
9450 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
9451 {
9452 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
9453
9454 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
9455 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
9456
9457 if (mdr->more()->waiting_on_slave.empty()) {
9458 if (mdr->slave_request)
9459 dispatch_slave_request(mdr);
9460 } else
9461 dout(10) << " still waiting for rename notify acks from "
9462 << mdr->more()->waiting_on_slave << dendl;
9463 }
9464 }
9465
9466 // snaps
9467 /* This function takes responsibility for the passed mdr */
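// lssnap fakes a readdir reply: an empty DirStat, then for each snapshot
// a dentry name, an infinite lease and an inodestat of the directory at
// that snapid, capped by max_entries/max_bytes and finished with the
// readdir FRAG_END / FRAG_COMPLETE flags.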
9468 void Server::handle_client_lssnap(MDRequestRef& mdr)
9469 {
9470 const MClientRequest::const_ref &req = mdr->client_request;
9471
9472 // traverse to path
9473 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9474 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9475 respond_to_request(mdr, -ESTALE);
9476 return;
9477 }
9478 if (!diri->is_auth()) {
9479 mdcache->request_forward(mdr, diri->authority().first);
9480 return;
9481 }
9482 if (!diri->is_dir()) {
9483 respond_to_request(mdr, -ENOTDIR);
9484 return;
9485 }
9486 dout(10) << "lssnap on " << *diri << dendl;
9487
9488 // lock snap
9489 MutationImpl::LockOpVec lov;
9490 mds->locker->include_snap_rdlocks(diri, lov);
9491 if (!mds->locker->acquire_locks(mdr, lov))
9492 return;
9493
9494 if (!check_access(mdr, diri, MAY_READ))
9495 return;
9496
9497 SnapRealm *realm = diri->find_snaprealm();
9498 map<snapid_t,const SnapInfo*> infomap;
9499 realm->get_snap_info(infomap, diri->get_oldest_snap());
9500
9501 unsigned max_entries = req->head.args.readdir.max_entries;
9502 if (!max_entries)
9503 max_entries = infomap.size();
9504 int max_bytes = req->head.args.readdir.max_bytes;
9505 if (!max_bytes)
9506 // make sure at least one item can be encoded (512 KiB plus room for a full xattr map)
9507 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
9508
9509 __u64 last_snapid = 0;
9510 string offset_str = req->get_path2();
9511 if (!offset_str.empty())
9512 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
9513
9514 // Empty DirStat
9515 bufferlist dirbl;
9516 static DirStat empty;
9517 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
9518
9519 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
9520
9521 __u32 num = 0;
9522 bufferlist dnbl;
9523 auto p = infomap.upper_bound(last_snapid);
9524 for (; p != infomap.end() && num < max_entries; ++p) {
9525 dout(10) << p->first << " -> " << *p->second << dendl;
9526
9527 // actual
9528 string snap_name;
9529 if (p->second->ino == diri->ino())
9530 snap_name = p->second->name;
9531 else
9532 snap_name = p->second->get_long_name();
9533
9534 unsigned start_len = dnbl.length();
9535 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
9536 break;
9537
9538 encode(snap_name, dnbl);
9539 // infinite lease
9540 LeaseStat e(-1, -1, 0);
9541 mds->locker->encode_lease(dnbl, mdr->session->info, e);
9542 dout(20) << "encode_infinite_lease" << dendl;
9543
9544 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
9545 if (r < 0) {
9546 bufferlist keep;
9547 keep.substr_of(dnbl, 0, start_len);
9548 dnbl.swap(keep);
9549 break;
9550 }
9551 ++num;
9552 }
9553
9554 encode(num, dirbl);
9555 __u16 flags = 0;
9556 if (p == infomap.end()) {
9557 flags = CEPH_READDIR_FRAG_END;
9558 if (last_snapid == 0)
9559 flags |= CEPH_READDIR_FRAG_COMPLETE;
9560 }
9561 encode(flags, dirbl);
9562 dirbl.claim_append(dnbl);
9563
9564 mdr->reply_extra_bl = dirbl;
9565 mdr->tracei = diri;
9566 respond_to_request(mdr, 0);
9567 }
9568
9569
9570 // MKSNAP
9571
9572 struct C_MDS_mksnap_finish : public ServerLogContext {
9573 CInode *diri;
9574 SnapInfo info;
9575 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
9576 ServerLogContext(s, r), diri(di), info(i) {}
9577 void finish(int r) override {
9578 server->_mksnap_finish(mdr, diri, info);
9579 }
9580 };
9581
9582 /* This function takes responsibility for the passed mdr */
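// mksnap is a two-phase update against the snap table: prepare_create()
// reserves a transaction id (stid) and the new snapid, the inode and
// snaprealm changes are journaled together with a TABLE_SNAP transaction,
// and _mksnap_finish() commits the stid and notifies peer MDSs and clients.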
9583 void Server::handle_client_mksnap(MDRequestRef& mdr)
9584 {
9585 const MClientRequest::const_ref &req = mdr->client_request;
9586 // make sure we have as new a map as the client
9587 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
9588 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
9589 return;
9590 }
9591 if (!mds->mdsmap->allows_snaps()) {
9592 // snapshot creation is disabled until the fs allow_new_snaps flag is set
9593 respond_to_request(mdr, -EPERM);
9594 return;
9595 }
9596
9597 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9598 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9599 respond_to_request(mdr, -ESTALE);
9600 return;
9601 }
9602
9603 if (!diri->is_auth()) { // fw to auth?
9604 mdcache->request_forward(mdr, diri->authority().first);
9605 return;
9606 }
9607
9608 // dir only
9609 if (!diri->is_dir()) {
9610 respond_to_request(mdr, -ENOTDIR);
9611 return;
9612 }
9613 if (diri->is_system() && !diri->is_root()) {
9614 // no snaps in system dirs (root is ok)
9615 respond_to_request(mdr, -EPERM);
9616 return;
9617 }
9618
9619 std::string_view snapname = req->get_filepath().last_dentry();
9620
9621 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9622 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9623 respond_to_request(mdr, -EPERM);
9624 return;
9625 }
9626
9627 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
9628
9629 // lock snap
9630 MutationImpl::LockOpVec lov;
9631
9632 mds->locker->include_snap_rdlocks(diri, lov);
9633 lov.erase_rdlock(&diri->snaplock);
9634 lov.add_xlock(&diri->snaplock);
9635
9636 if (!mds->locker->acquire_locks(mdr, lov))
9637 return;
9638
9639 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9640 return;
9641
9642 // make sure name is unique
9643 if (diri->snaprealm &&
9644 diri->snaprealm->exists(snapname)) {
9645 respond_to_request(mdr, -EEXIST);
9646 return;
9647 }
9648 if (snapname.length() == 0 ||
9649 snapname[0] == '_') {
9650 respond_to_request(mdr, -EINVAL);
9651 return;
9652 }
9653
9654 // allocate a snapid
9655 if (!mdr->more()->stid) {
9656 // prepare an stid
9657 mds->snapclient->prepare_create(diri->ino(), snapname,
9658 mdr->get_mds_stamp(),
9659 &mdr->more()->stid, &mdr->more()->snapidbl,
9660 new C_MDS_RetryRequest(mdcache, mdr));
9661 return;
9662 }
9663
9664 version_t stid = mdr->more()->stid;
9665 snapid_t snapid;
9666 auto p = mdr->more()->snapidbl.cbegin();
9667 decode(snapid, p);
9668 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
9669
9670 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9671
9672 // journal
9673 SnapInfo info;
9674 info.ino = diri->ino();
9675 info.snapid = snapid;
9676 info.name = snapname;
9677 info.stamp = mdr->get_op_stamp();
9678
9679 auto &pi = diri->project_inode(false, true);
9680 pi.inode.ctime = info.stamp;
9681 if (info.stamp > pi.inode.rstat.rctime)
9682 pi.inode.rstat.rctime = info.stamp;
9683 pi.inode.rstat.rsnaps++;
9684 pi.inode.version = diri->pre_dirty();
9685
9686 // project the snaprealm
9687 auto &newsnap = *pi.snapnode;
9688 newsnap.created = snapid;
9689 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
9690 if (!em.second)
9691 em.first->second = info;
9692 newsnap.seq = snapid;
9693 newsnap.last_created = snapid;
9694
9695 // journal the inode changes
9696 mdr->ls = mdlog->get_current_segment();
9697 EUpdate *le = new EUpdate(mdlog, "mksnap");
9698 mdlog->start_entry(le);
9699
9700 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9701 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9702 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9703 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9704
9705 // journal the snaprealm changes
9706 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
9707 mdr, __func__);
9708 mdlog->flush();
9709 }
9710
9711 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
9712 {
9713 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
9714
9715 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
9716
9717 diri->pop_and_dirty_projected_inode(mdr->ls);
9718 mdr->apply();
9719
9720 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9721
9722 // create snap
9723 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9724
9725 // notify other mds
9726 mdcache->send_snap_update(diri, mdr->more()->stid, op);
9727
9728 mdcache->do_realm_invalidate_and_update_notify(diri, op);
9729
9730 // yay
9731 mdr->in[0] = diri;
9732 mdr->snapid = info.snapid;
9733 mdr->tracei = diri;
9734 respond_to_request(mdr, 0);
9735 }
9736
9737
9738 // RMSNAP
9739
9740 struct C_MDS_rmsnap_finish : public ServerLogContext {
9741 CInode *diri;
9742 snapid_t snapid;
9743 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
9744 ServerLogContext(s, r), diri(di), snapid(sn) {}
9745 void finish(int r) override {
9746 server->_rmsnap_finish(mdr, diri, snapid);
9747 }
9748 };
9749
9750 /* This function takes responsibility for the passed mdr */
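// rmsnap mirrors mksnap: prepare_destroy() against the snap table, journal
// the inode + snaprealm update with a TABLE_SNAP transaction, then commit
// and broadcast CEPH_SNAP_OP_DESTROY.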
9751 void Server::handle_client_rmsnap(MDRequestRef& mdr)
9752 {
9753 const MClientRequest::const_ref &req = mdr->client_request;
9754
9755 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9756 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9757 respond_to_request(mdr, -ESTALE);
9758 return;
9759 }
9760 if (!diri->is_auth()) { // fw to auth?
9761 mdcache->request_forward(mdr, diri->authority().first);
9762 return;
9763 }
9764 if (!diri->is_dir()) {
9765 respond_to_request(mdr, -ENOTDIR);
9766 return;
9767 }
9768
9769 std::string_view snapname = req->get_filepath().last_dentry();
9770
9771 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9772 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9773 respond_to_request(mdr, -EPERM);
9774 return;
9775 }
9776
9777 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
9778
9779 // does snap exist?
9780 if (snapname.length() == 0 || snapname[0] == '_') {
9781 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
9782 return;
9783 }
9784 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
9785 respond_to_request(mdr, -ENOENT);
9786 return;
9787 }
9788 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
9789 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
9790
9791 MutationImpl::LockOpVec lov;
9792 mds->locker->include_snap_rdlocks(diri, lov);
9793 lov.erase_rdlock(&diri->snaplock);
9794 lov.add_xlock(&diri->snaplock);
9795
9796 if (!mds->locker->acquire_locks(mdr, lov))
9797 return;
9798
9799 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9800 return;
9801
9802 // prepare
9803 if (!mdr->more()->stid) {
9804 mds->snapclient->prepare_destroy(diri->ino(), snapid,
9805 &mdr->more()->stid, &mdr->more()->snapidbl,
9806 new C_MDS_RetryRequest(mdcache, mdr));
9807 return;
9808 }
9809 version_t stid = mdr->more()->stid;
9810 auto p = mdr->more()->snapidbl.cbegin();
9811 snapid_t seq;
9812 decode(seq, p);
9813 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
9814
9815 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9816
9817 // journal
9818 auto &pi = diri->project_inode(false, true);
9819 pi.inode.version = diri->pre_dirty();
9820 pi.inode.ctime = mdr->get_op_stamp();
9821 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
9822 pi.inode.rstat.rctime = mdr->get_op_stamp();
9823 pi.inode.rstat.rsnaps--;
9824
9825 mdr->ls = mdlog->get_current_segment();
9826 EUpdate *le = new EUpdate(mdlog, "rmsnap");
9827 mdlog->start_entry(le);
9828
9829 // project the snaprealm
9830 auto &newnode = *pi.snapnode;
9831 newnode.snaps.erase(snapid);
9832 newnode.seq = seq;
9833 newnode.last_destroyed = seq;
9834
9835 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9836 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9837 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9838 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9839
9840 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
9841 mdr, __func__);
9842 mdlog->flush();
9843 }
9844
9845 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9846 {
9847 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
9848 snapid_t stid = mdr->more()->stid;
9849 auto p = mdr->more()->snapidbl.cbegin();
9850 snapid_t seq;
9851 decode(seq, p);
9852
9853 diri->pop_and_dirty_projected_inode(mdr->ls);
9854 mdr->apply();
9855
9856 mds->snapclient->commit(stid, mdr->ls);
9857
9858 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9859
9860 // notify other mds
9861 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
9862
9863 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
9864
9865 // yay
9866 mdr->in[0] = diri;
9867 respond_to_request(mdr, 0);
9868
9869 // purge snapshot data
9870 if (diri->snaprealm->have_past_parents_open())
9871 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
9872 }
9873
9874 struct C_MDS_renamesnap_finish : public ServerLogContext {
9875 CInode *diri;
9876 snapid_t snapid;
9877 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
9878 ServerLogContext(s, r), diri(di), snapid(sn) {}
9879 void finish(int r) override {
9880 server->_renamesnap_finish(mdr, diri, snapid);
9881 }
9882 };
9883
9884 /* This function takes responsibility for the passed mdr */
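// renamesnap only rewrites the SnapInfo name: the same two-phase snap
// table sequence (prepare_update(), then commit), broadcast as
// CEPH_SNAP_OP_UPDATE.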
9885 void Server::handle_client_renamesnap(MDRequestRef& mdr)
9886 {
9887 const MClientRequest::const_ref &req = mdr->client_request;
9888 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
9889 respond_to_request(mdr, -EINVAL);
9890 return;
9891 }
9892
9893 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9894 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9895 respond_to_request(mdr, -ESTALE);
9896 return;
9897 }
9898
9899 if (!diri->is_auth()) { // fw to auth?
9900 mdcache->request_forward(mdr, diri->authority().first);
9901 return;
9902 }
9903
9904 if (!diri->is_dir()) { // dir only
9905 respond_to_request(mdr, -ENOTDIR);
9906 return;
9907 }
9908
9909 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
9910 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9911 respond_to_request(mdr, -EPERM);
9912 return;
9913 }
9914
9915 std::string_view dstname = req->get_filepath().last_dentry();
9916 std::string_view srcname = req->get_filepath2().last_dentry();
9917 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
9918
9919 if (srcname.length() == 0 || srcname[0] == '_') {
9920 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
9921 return;
9922 }
9923 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
9924 respond_to_request(mdr, -ENOENT);
9925 return;
9926 }
9927 if (dstname.length() == 0 || dstname[0] == '_') {
9928 respond_to_request(mdr, -EINVAL);
9929 return;
9930 }
9931 if (diri->snaprealm->exists(dstname)) {
9932 respond_to_request(mdr, -EEXIST);
9933 return;
9934 }
9935
9936 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
9937 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
9938
9939 // lock snap
9940 MutationImpl::LockOpVec lov;
9941
9942 mds->locker->include_snap_rdlocks(diri, lov);
9943 lov.erase_rdlock(&diri->snaplock);
9944 lov.add_xlock(&diri->snaplock);
9945
9946 if (!mds->locker->acquire_locks(mdr, lov))
9947 return;
9948
9949 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9950 return;
9951
9952 // prepare
9953 if (!mdr->more()->stid) {
9954 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
9955 &mdr->more()->stid,
9956 new C_MDS_RetryRequest(mdcache, mdr));
9957 return;
9958 }
9959
9960 version_t stid = mdr->more()->stid;
9961 dout(10) << " stid is " << stid << dendl;
9962
9963 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9964
9965 // journal
9966 auto &pi = diri->project_inode(false, true);
9967 pi.inode.ctime = mdr->get_op_stamp();
9968 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
9969 pi.inode.rstat.rctime = mdr->get_op_stamp();
9970 pi.inode.version = diri->pre_dirty();
9971
9972 // project the snaprealm
9973 auto &newsnap = *pi.snapnode;
9974 auto it = newsnap.snaps.find(snapid);
9975 ceph_assert(it != newsnap.snaps.end());
9976 it->second.name = dstname;
9977
9978 // journal the inode changes
9979 mdr->ls = mdlog->get_current_segment();
9980 EUpdate *le = new EUpdate(mdlog, "renamesnap");
9981 mdlog->start_entry(le);
9982
9983 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9984 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9985 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9986 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9987
9988 // journal the snaprealm changes
9989 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
9990 mdr, __func__);
9991 mdlog->flush();
9992 }
9993
9994 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9995 {
9996 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
9997
9998 diri->pop_and_dirty_projected_inode(mdr->ls);
9999 mdr->apply();
10000
10001 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10002
10003 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10004
10005 // notify other mds
10006 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
10007
10008 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
10009
10010 // yay
10011 mdr->in[0] = diri;
10012 mdr->tracei = diri;
10013 mdr->snapid = snapid;
10014 respond_to_request(mdr, 0);
10015 }
10016
10017 /**
10018 * Return true if server is in state RECONNECT and this
10019 * client has not yet reconnected.
10020 */
10021 bool Server::waiting_for_reconnect(client_t c) const
10022 {
10023 return client_reconnect_gather.count(c) > 0;
10024 }
10025
10026 void Server::dump_reconnect_status(Formatter *f) const
10027 {
10028 f->open_object_section("reconnect_status");
10029 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
10030 f->close_section();
10031 }