// ceph/src/mds/Server.cc -- from ceph.git, Nautilus 14.2.1
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>
#include <math.h>

#include <list>
#include <iostream>
#include <string_view>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSContext {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};
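
/*
 * Editorial note (illustrative, not part of the original source): completion
 * contexts in this file subclass ServerContext/ServerLogContext and override
 * finish(), which the journaling machinery invokes once the corresponding
 * log entry is safe.  C_MDS_session_finish below is the canonical example;
 * a minimal hypothetical subclass looks like:
 *
 *   class C_MDS_my_op_finish : public ServerLogContext {
 *     void finish(int r) override {
 *       ceph_assert(r == 0);
 *       // apply the journaled state change via server / mdr here
 *     }
 *   public:
 *     C_MDS_my_op_finish(Server *s, MDRequestRef& r) : ServerLogContext(s, r) {}
 *   };
 */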

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}
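
/*
 * Editorial note: the counters registered above are bumped at the matching
 * dispatch points later in this file; a minimal sketch of the pattern
 * (`lat` stands for a latency value computed by the caller):
 *
 *   if (logger) {
 *     logger->inc(l_mdss_handle_client_request);      // event counter
 *     logger->tinc(l_mdss_req_getattr_latency, lat);  // time-averaged counter
 *   }
 *
 * The null check matters because dispatch can run before create_logger().
 */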

Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  logger(0),
  is_full(false),
  reconnect_done(NULL),
  failed_reconnects(0),
  reconnect_evicting(false),
  terminating_sessions(false),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}

void Server::dispatch(const Message::const_ref &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(MClientReconnect::msgref_cast(m));
    return;
  }

  // active?
  // handle_slave_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = MClientRequest::msgref_cast(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || session->is_closed()) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
        }
      } else if (req->get_retry_attempt()) {
        // Process a completed request in the clientreplay stage: the completed
        // request may have created a new file/directory, and this guarantees
        // the MDS sends a reply to the client before another request modifies
        // the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(MClientRequest::msgref_cast(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(MClientReclaim::msgref_cast(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(MMDSSlaveRequest::msgref_cast(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}
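
/*
 * Editorial note: the clientreplay gating above boils down to two small
 * predicates; a sketch (using the same names as the code above):
 *
 *   bool queue_replay = req->is_replay() ||
 *     (req->get_retry_attempt() &&
 *      session->have_completed_request(req->get_reqid().tid, nullptr));
 *   bool wait_for_active = !mds->is_stopping() &&
 *     !(mds->is_clientreplay() && req->is_queued_for_replay());
 *
 * Replayed ops must be re-applied in order, while already-completed retries
 * only need their reply resent before anything else touches the files they
 * created.
 */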


// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t>& i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(i), inotablev(iv), fin(fin_) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev);
    if (fin) {
      fin->complete(r);
    }
  }
};

Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

void Server::reclaim_session(Session *session, const MClientReclaim::const_ref &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = MClientReclaimReply::create(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-EPERM);
      mds->send_message_client(reply, session);
    }

    assert(!target->reclaiming_from);
    assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

void Server::finish_reclaim_session(Session *session, const MClientReclaimReply::ref &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new FunctionContext([this, session_id, reply](int r) {
        assert(mds->mds_lock.is_locked_by_me());
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blacklisted(target->info.inst.addr);
    });

    if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      std::stringstream ss;
      mds->evict_client(target->get_client().v, false, true, ss, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

void Server::handle_client_reclaim(const MClientReclaim::const_ref &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}
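
/*
 * Editorial note: the reclaim message flow implemented above, end to end
 * (client-side steps are paraphrased):
 *
 *   1. client -> mds: MClientReclaim{uuid, CEPH_RECLAIM_RESET}
 *   2. reclaim_session() finds the old session by uuid, evicts or kills it,
 *      and replies with MClientReclaimReply{result, old addrs, osd epoch}
 *   3. client -> mds: MClientReclaim{FLAG_FINISH} once it has taken over,
 *      which clears reclaiming_from via finish_reclaim_session()
 *
 * Only CEPH_RECLAIM_RESET is supported; any other flag draws -EOPNOTSUPP.
 */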

void Server::handle_client_session(const MClientSession::const_ref &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
        auto m = MClientSession::create(CEPH_SESSION_REJECT);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blacklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blacklisted(addr);
          });

      if (blacklisted) {
        dout(10) << "rejecting blacklisted client " << addr << dendl;
        send_reject_message("blacklisted");
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << " " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        send_reject_message(ss.str());
        mds->clog->warn() << "client session lacks required features '"
                          << missing_features << "' denied (" << session->info.inst << ")";
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        stringstream ss;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
          ss << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          ss << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(ss.str());
          mds->clog->warn() << "client session with " << ss.str()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed())
        mds->sessionmap.add_session(session);

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new FunctionContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = MClientSession::create(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle it the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping from client: "
                          << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}


void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather->new_sub());
  mds->send_message_client(
      MClientSession::create(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
    ceph_assert(session);
    flush_session(session, &gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             interval_set<inodeno_t>& inos, version_t piv)
{
  dout(10) << "_session_logged " << session->info.inst << " state_seq " << state_seq << " " << (open ? "open":"close")
           << " " << pv << dendl;

  if (piv) {
    ceph_assert(session->is_closing() || session->is_killing() ||
                session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    mds->inotable->apply_release_ids(inos);
    ceph_assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    ceph_assert(session->get_connection());
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = MClientSession::create(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(MClientSession::create(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        session->get_connection()->set_priv(NULL);
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blacklisted(p->second.addr)) {
            dout(10) << " ignoring blacklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}
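
/*
 * Editorial note: callers (e.g. the cap import path) use the prepare/finish
 * pair around a journaled event; a hedged sketch of the calling pattern:
 *
 *   map<client_t, pair<Session*, uint64_t> > smap;
 *   version_t pv = server->prepare_force_open_sessions(cm, cmm, smap);
 *   // ... journal an event recording pv; once it is safe:
 *   server->finish_force_open_sessions(smap, true);  // true: dec_importing
 *
 * The per-session sseq stored in smap lets finish_force_open_sessions()
 * skip sessions whose state changed (e.g. a racing close) in the interim.
 */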

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);

        auto reply = MClientSession::create(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          reply->supported_features = supported_features;
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}


void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  //  (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  std::vector<Session*> to_evict;

  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      mds->locker->revoke_stale_caps(session);
      mds->locker->remove_stale_leases(session);
      mds->send_message_client(MClientSession::create(CEPH_SESSION_STALE, session->get_push_seq()), session);
      finish_flush_session(session, session->get_push_seq());
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not kicking any clients to be safe" << dendl;
    return;
  }

  if (mds->sessionmap.get_sessions().size() == 1 && mds->mdsmap->get_num_in_mds() == 1) {
    dout(20) << "skipping client eviction because there is only one" << dendl;
    return;
  }

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session : to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}
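
/*
 * Editorial note: worked example of the cutoffs above, assuming the default
 * mds_session_timeout=60s and mds_session_autoclose=300s with a dispatch
 * queue that is 5s behind:
 *
 *   stale cutoff     = 5 + 60  = 65s  without cap renewal -> session STALE
 *   autoclose cutoff = 5 + 300 = 305s without cap renewal -> evict
 *
 * Sessions advertising a nonzero "timeout" metadata key substitute their own
 * value for the stale cutoff and are evicted directly, skipping STALE.
 */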

void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  std::list<client_t> to_evict;
  mds->locker->get_late_revoking_clients(&to_evict, cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    std::stringstream ss;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blacklist_on_evict,
                                     ss, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const ConfigProxy& conf,
                                const std::set<std::string> &changed) {
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(mds->mds_lock.is_locked_by_me());

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  std::list<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    if (blacklist.count(s->info.inst.addr)) {
      victims.push_back(s);
    }
  }

  for (const auto s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->info.prealloc_inos);
  both.insert(session->pending_prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  mdlog->start_submit_entry(new ESession(session->info.inst, false, pv, both, piv),
                            new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe));
  mdlog->flush();

  // clean up requests, too
  elist<MDRequestImpl*>::iterator p =
      session->requests.begin(member_offset(MDRequestImpl,
                                            item_session_request));
  while (!p.end()) {
    MDRequestRef mdr = mdcache->request_get((*p)->reqid);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

void Server::handle_client_reconnect(const MClientReconnect::const_ref &m)
{
  dout(7) << "handle_client_reconnect " << m->get_source()
          << (m->has_more() ? " (more)" : "") << dendl;
  client_t from = m->get_source().num();
  Session *session = mds->get_session(m);
  ceph_assert(session);

  if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
    dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
    mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
    return;
  }

  auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
  dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;

  bool deny = false;
  if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
    // XXX maybe in the future we can do better than this?
    dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
    mds->clog->info() << "denied reconnect attempt (mds is "
                      << ceph_mds_state_name(mds->get_state())
                      << ") from " << m->get_source_inst()
                      << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
    deny = true;
  } else {
    std::string error_str;
    if (!session->is_open()) {
      error_str = "session is closed";
    } else if (mdcache->is_readonly()) {
      error_str = "mds is readonly";
    } else {
      if (session->info.client_metadata.features.empty())
        infer_supported_features(session, session->info.client_metadata);

      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        error_str = ss.str();
      }
    }

    if (!error_str.empty()) {
      deny = true;
      dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
      mds->clog->info() << "denied reconnect attempt from "
                        << m->get_source_inst() << " (" << error_str << ")";
    }
  }

  if (deny) {
    auto r = MClientSession::create(CEPH_SESSION_CLOSE);
    mds->send_message_client(r, session);
    if (session->is_open())
      kill_session(session, nullptr);
    return;
  }

  if (!m->has_more()) {
    // notify client of success with an OPEN
    auto reply = MClientSession::create(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
  }

  session->last_cap_renew = clock::now();

  // snaprealms
  for (const auto &r : m->realms) {
    CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in) {
      if (in->snaprealm) {
        dout(15) << "open snaprealm (w inode) on " << *in << dendl;
      } else {
        // this can happen if we are non-auth or we rollback snaprealm
        dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
      }
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    } else {
      dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
               << " seq " << r.realm.seq << dendl;
      mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
    }
  }

  // caps
  for (const auto &p : m->caps) {
    // make sure our last_cap_id is MAX over all issued caps
    if (p.second.capinfo.cap_id > mdcache->last_cap_id)
      mdcache->last_cap_id = p.second.capinfo.cap_id;

    CInode *in = mdcache->get_inode(p.first);
    if (in && in->state_test(CInode::STATE_PURGING))
      continue;
    if (in && in->is_auth()) {
      // we recovered it, and it's ours.  take note.
      dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
               << " on " << *in << dendl;
      in->reconnect_cap(from, p.second, session);
      mdcache->add_reconnected_cap(from, p.first, p.second);
      recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
      continue;
    }

    if (in && !in->is_auth()) {
      // not mine.
      dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
      // add to cap export list.
      mdcache->rejoin_export_caps(p.first, from, p.second,
                                  in->authority().first, true);
    } else {
      // don't know if the inode is mine
      dout(10) << "missing ino " << p.first << ", will load later" << dendl;
      mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
    }
  }

  reconnect_last_seen = clock::now();

  if (!m->has_more()) {
    mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);

    // remove from gather set
    client_reconnect_gather.erase(from);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
  }
}

void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
{
  int supported = -1;
  auto it = client_metadata.find("ceph_version");
  if (it != client_metadata.end()) {
    // user space client
    if (it->second.compare(0, 16, "ceph version 12.") == 0)
      supported = CEPHFS_FEATURE_LUMINOUS;
    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
      supported = CEPHFS_FEATURE_KRAKEN;
  } else {
    it = client_metadata.find("kernel_version");
    if (it != client_metadata.end()) {
      // kernel client
      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
        supported = CEPHFS_FEATURE_LUMINOUS;
    }
  }
  if (supported == -1 &&
      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
    supported = CEPHFS_FEATURE_JEWEL;

  if (supported >= 0) {
    unsigned long value = (1UL << (supported + 1)) - 1;
    client_metadata.features = feature_bitset_t(value);
    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
  }
}
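
/*
 * Editorial note: the mask math above sets every feature bit up to and
 * including `supported`.  For example, if supported == 7:
 *
 *   (1UL << (7 + 1)) - 1 == 0xff   // bits 0..7 set
 *
 * i.e. a client that proves it speaks release N is credited with every
 * feature introduced up to and including N.
 */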

void Server::update_required_client_features()
{
  vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;

  int min_compat = mds->mdsmap->get_min_compat_client();
  if (min_compat >= CEPH_RELEASE_NAUTILUS) {
    bits.push_back(CEPHFS_FEATURE_NAUTILUS);
  } else if (min_compat >= CEPH_RELEASE_MIMIC)
    bits.push_back(CEPHFS_FEATURE_MIMIC);
  else if (min_compat >= CEPH_RELEASE_LUMINOUS)
    bits.push_back(CEPHFS_FEATURE_LUMINOUS);
  else if (min_compat >= CEPH_RELEASE_KRAKEN)
    bits.push_back(CEPHFS_FEATURE_KRAKEN);
  else if (min_compat >= CEPH_RELEASE_JEWEL)
    bits.push_back(CEPHFS_FEATURE_JEWEL);

  std::sort(bits.begin(), bits.end());
  required_client_features = feature_bitset_t(bits);
  dout(7) << "required_client_features: " << required_client_features << dendl;

  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
    set<Session*> sessions;
    mds->sessionmap.get_client_session_set(sessions);
    for (auto session : sessions) {
      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
        bool blacklisted = mds->objecter->with_osdmap(
            [session](const OSDMap &osd_map) -> bool {
              return osd_map.is_blacklisted(session->info.inst.addr);
            });
        if (blacklisted)
          continue;

        mds->clog->warn() << "evicting session " << *session << ", missing required features '"
                          << missing_features << "'";
        std::stringstream ss;
        mds->evict_client(session->get_client().v, false,
                          g_conf()->mds_session_blacklist_on_evict, ss);
      }
    }
  }
}

void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish.  failed on " << failed_reconnects << " clients" << dendl;
  ceph_assert(reconnect_done);

  if (!mds->snapclient->is_synced()) {
    // make sure snaptable cache is populated.  snaprealms will be
    // extensively used in rejoin stage.
    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
    mds->snapclient->wait_for_sync(reconnect_done);
  } else {
    reconnect_done->complete(0);
  }
  reconnect_done = NULL;
}

void Server::reconnect_tick()
{
  if (reconnect_evicting) {
    dout(7) << "reconnect_tick: waiting for evictions" << dendl;
    return;
  }

  if (client_reconnect_gather.empty())
    return;

  auto now = clock::now();
  auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
  if (elapse1 < g_conf()->mds_reconnect_timeout)
    return;

  vector<Session*> remaining_sessions;
  remaining_sessions.reserve(client_reconnect_gather.size());
  for (auto c : client_reconnect_gather) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
    ceph_assert(session);
    remaining_sessions.push_back(session);
    // client re-sends cap flush messages before the reconnect message
    if (session->last_seen > reconnect_last_seen)
      reconnect_last_seen = session->last_seen;
  }

  auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
  if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
    dout(7) << "reconnect_tick: last seen " << elapse2
            << " seconds ago, extending reconnect interval" << dendl;
    return;
  }

  dout(7) << "reconnect timed out, " << remaining_sessions.size()
          << " clients have not reconnected in time" << dendl;

  // If we're doing blacklist evictions, use this to wait for them before
  // proceeding to reconnect_gather_finish
  MDSGatherBuilder gather(g_ceph_context);

  for (auto session : remaining_sessions) {
    // Keep sessions that have specified timeout.  These sessions will prevent
    // the mds from going active.  The MDS goes active once they all have been
    // killed or reclaimed.
    if (session->info.client_metadata.find("timeout") !=
        session->info.client_metadata.end()) {
      dout(1) << "reconnect keeps " << session->info.inst
              << ", need to be reclaimed" << dendl;
      client_reclaim_gather.insert(session->get_client());
      continue;
    }

    dout(1) << "reconnect gives up on " << session->info.inst << dendl;

    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after waiting " << elapse1
                      << " seconds during MDS startup";

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss,
                        gather.new_sub());
    } else {
      kill_session(session, NULL);
    }

    failed_reconnects++;
  }
  client_reconnect_gather.clear();

  if (gather.has_subs()) {
    dout(1) << "reconnect will complete once clients are evicted" << dendl;
    gather.set_finisher(new MDSInternalContextWrapper(mds, new FunctionContext(
        [this](int r){reconnect_gather_finish();})));
    gather.activate();
    reconnect_evicting = true;
  } else {
    reconnect_gather_finish();
  }
}

void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  auto p = locks.cbegin();
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}
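
/*
 * Editorial note: the flockbl blob decoded above is two back-to-back lists,
 * fcntl (POSIX) locks first, then BSD flock locks:
 *
 *   [int num_fcntl_locks][ceph_filelock x n][int num_flock_locks][ceph_filelock x n]
 *
 * A sketch of the matching encode side (illustrative; the real encoder lives
 * with the client reconnect code):
 *
 *   void encode_locks(bufferlist& bl, const list<ceph_filelock>& fcntl,
 *                     const list<ceph_filelock>& flock) {
 *     encode((int)fcntl.size(), bl);
 *     for (auto& l : fcntl) encode(l, bl);
 *     encode((int)flock.size(), bl);
 *     for (auto& l : flock) encode(l, bl);
 *   }
 */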
1523
1524 /**
1525 * Call this when the MDCache is oversized, to send requests to the clients
1526 * to trim some caps, and consequently unpin some inodes in the MDCache so
1527 * that it can trim too.
1528 */
1529 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1530 {
1531 const auto now = clock::now();
1532 const bool steady = flags&RecallFlags::STEADY;
1533 const bool enforce_max = flags&RecallFlags::ENFORCE_MAX;
1534
1535 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1536 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1537 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1538 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1539 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1540
1541 dout(7) << __func__ << ":"
1542 << " min=" << min_caps_per_client
1543 << " max=" << max_caps_per_client
1544 << " total=" << Capability::count()
1545 << " flags=0x" << std::hex << flags
1546 << dendl;
1547
1548 /* trim caps of sessions with the most caps first */
1549 std::multimap<uint64_t, Session*> caps_session;
1550 auto f = [&caps_session, enforce_max, max_caps_per_client](auto& s) {
1551 auto num_caps = s->caps.size();
1552 if (!enforce_max || num_caps > max_caps_per_client) {
1553 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1554 }
1555 };
1556 mds->sessionmap.get_client_sessions(std::move(f));
1557
1558 std::pair<bool, uint64_t> result = {false, 0};
1559 auto& [throttled, caps_recalled] = result;
1560 last_recall_state = now;
1561 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1562 if (!session->is_open() ||
1563 !session->get_connection() ||
1564 !session->info.inst.name.is_client())
1565 continue;
1566
1567 dout(10) << __func__ << ":"
1568 << " session " << session->info.inst
1569 << " caps " << num_caps
1570 << ", leases " << session->leases.size()
1571 << dendl;
1572
1573 uint64_t newlim;
1574 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1575 newlim = min_caps_per_client;
1576 } else {
1577 newlim = num_caps-recall_max_caps;
1578 }
1579 if (num_caps > newlim) {
1580 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1581 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1582 newlim = num_caps-recall;
1583 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1584 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1585 const uint64_t global_recall_throttle = recall_throttle.get();
1586 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1587 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1588 throttled = true;
1589 continue;
1590 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1591 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1592 throttled = true;
1593 continue;
1594 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1595 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1596 throttled = true;
1597 break;
1598 }
1599
1600 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1601 if (steady) {
1602 const auto session_recall = session->get_recall_caps();
1603 const auto session_release = session->get_release_caps();
1604 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1605 /* The session has been unable to keep up with the number of caps
1606 * recalled (releases are less than half of recalls); additionally, to
1607 * avoid flagging sessions we've only just begun to recall from, the
1608 * session_recall counter (decayed count of caps recently recalled)
1609 * must exceed half the threshold for the session's cap recall
1610 * throttle. */
1611 dout(15) << " 2*session_release < session_recall"
1612 " (2*" << session_release << " < " << session_recall << ") &&"
1613 " 2*session_recall < recall_max_decay_threshold"
1614 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1615 " Skipping because we are unlikely to get more released." << dendl;
1616 continue;
1617 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1618 /* The number of caps recalled this time (recall) is less than the
1619 * number we *could* recall (so there isn't much left to recall?) and
1620 * it is also less than half the decayed count of caps recently
1621 * recalled (session_recall): the client is still working through an
1622 * earlier recall. */
1623 dout(15) << " 2*recall < session_recall"
1624 " (2*" << recall << " < " << session_recall << ") &&"
1625 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1626 " Skipping because we are unlikely to get more released." << dendl;
1627 continue;
1628 }
1629 }
1630
1631 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1632
1633 auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
1634 m->head.max_caps = newlim;
1635 mds->send_message_client(m, session);
1636 if (gather) {
1637 flush_session(session, gather);
1638 }
1639 caps_recalled += session->notify_recall_sent(newlim);
1640 recall_throttle.hit(recall);
1641 }
1642 }
1643
1644 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1645
1646 return result;
1647 }
1648
1649 void Server::force_clients_readonly()
1650 {
1651 dout(10) << "force_clients_readonly" << dendl;
1652 set<Session*> sessions;
1653 mds->sessionmap.get_client_session_set(sessions);
1654 for (set<Session*>::const_iterator p = sessions.begin();
1655 p != sessions.end();
1656 ++p) {
1657 Session *session = *p;
1658 if (!session->info.inst.name.is_client() ||
1659 !(session->is_open() || session->is_stale()))
1660 continue;
1661 mds->send_message_client(MClientSession::create(CEPH_SESSION_FORCE_RO), session);
1662 }
1663 }
1664
1665 /*******
1666 * some generic stuff for finishing off requests
1667 */
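// journal_and_reply() below implements the early-reply path: the client
// can get an "unsafe" reply as soon as the update is submitted to the
// journal (see early_reply() and MClientReply::set_unsafe()), and the
// definitive safe reply follows once the event commits, via the
// MDSLogContextBase `fin` passed in.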
1668 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1669 {
1670 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1671 ceph_assert(!mdr->has_completed);
1672
1673 // note trace items for eventual reply.
1674 mdr->tracei = in;
1675 if (in)
1676 mdr->pin(in);
1677
1678 mdr->tracedn = dn;
1679 if (dn)
1680 mdr->pin(dn);
1681
1682 early_reply(mdr, in, dn);
1683
1684 mdr->committing = true;
1685 submit_mdlog_entry(le, fin, mdr, __func__);
1686
1687 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1688 if (mds->queue_one_replay()) {
1689 dout(10) << " queued next replay op" << dendl;
1690 } else {
1691 dout(10) << " journaled last replay op" << dendl;
1692 }
1693 } else if (mdr->did_early_reply)
1694 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1695 else
1696 mdlog->flush();
1697 }
1698
1699 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1700 std::string_view event)
1701 {
1702 if (mdr) {
1703 string event_str("submit entry: ");
1704 event_str += event;
1705 mdr->mark_event(event_str);
1706 }
1707 mdlog->submit_entry(le, fin);
1708 }
1709
1710 /*
1711 * send response built from mdr contents and error code; clean up mdr
1712 */
1713 void Server::respond_to_request(MDRequestRef& mdr, int r)
1714 {
1715 if (mdr->client_request) {
1716 reply_client_request(mdr, MClientReply::create(*mdr->client_request, r));
1717 } else if (mdr->internal_op > -1) {
1718 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1719 if (!mdr->internal_op_finish)
1720 ceph_abort_msg("trying to respond to internal op without finisher");
1721 mdr->internal_op_finish->complete(r);
1722 mdcache->request_finish(mdr);
1723 }
1724 }
1725
1726 // record per-op request counts and latency statistics
1727 void Server::perf_gather_op_latency(const MClientRequest::const_ref &req, utime_t lat)
1728 {
1729 int code = l_mdss_first;
1730 switch(req->get_op()) {
1731 case CEPH_MDS_OP_LOOKUPHASH:
1732 code = l_mdss_req_lookuphash_latency;
1733 break;
1734 case CEPH_MDS_OP_LOOKUPINO:
1735 code = l_mdss_req_lookupino_latency;
1736 break;
1737 case CEPH_MDS_OP_LOOKUPPARENT:
1738 code = l_mdss_req_lookupparent_latency;
1739 break;
1740 case CEPH_MDS_OP_LOOKUPNAME:
1741 code = l_mdss_req_lookupname_latency;
1742 break;
1743 case CEPH_MDS_OP_LOOKUP:
1744 code = l_mdss_req_lookup_latency;
1745 break;
1746 case CEPH_MDS_OP_LOOKUPSNAP:
1747 code = l_mdss_req_lookupsnap_latency;
1748 break;
1749 case CEPH_MDS_OP_GETATTR:
1750 code = l_mdss_req_getattr_latency;
1751 break;
1752 case CEPH_MDS_OP_SETATTR:
1753 code = l_mdss_req_setattr_latency;
1754 break;
1755 case CEPH_MDS_OP_SETLAYOUT:
1756 code = l_mdss_req_setlayout_latency;
1757 break;
1758 case CEPH_MDS_OP_SETDIRLAYOUT:
1759 code = l_mdss_req_setdirlayout_latency;
1760 break;
1761 case CEPH_MDS_OP_SETXATTR:
1762 code = l_mdss_req_setxattr_latency;
1763 break;
1764 case CEPH_MDS_OP_RMXATTR:
1765 code = l_mdss_req_rmxattr_latency;
1766 break;
1767 case CEPH_MDS_OP_READDIR:
1768 code = l_mdss_req_readdir_latency;
1769 break;
1770 case CEPH_MDS_OP_SETFILELOCK:
1771 code = l_mdss_req_setfilelock_latency;
1772 break;
1773 case CEPH_MDS_OP_GETFILELOCK:
1774 code = l_mdss_req_getfilelock_latency;
1775 break;
1776 case CEPH_MDS_OP_CREATE:
1777 code = l_mdss_req_create_latency;
1778 break;
1779 case CEPH_MDS_OP_OPEN:
1780 code = l_mdss_req_open_latency;
1781 break;
1782 case CEPH_MDS_OP_MKNOD:
1783 code = l_mdss_req_mknod_latency;
1784 break;
1785 case CEPH_MDS_OP_LINK:
1786 code = l_mdss_req_link_latency;
1787 break;
1788 case CEPH_MDS_OP_UNLINK:
1789 code = l_mdss_req_unlink_latency;
1790 break;
1791 case CEPH_MDS_OP_RMDIR:
1792 code = l_mdss_req_rmdir_latency;
1793 break;
1794 case CEPH_MDS_OP_RENAME:
1795 code = l_mdss_req_rename_latency;
1796 break;
1797 case CEPH_MDS_OP_MKDIR:
1798 code = l_mdss_req_mkdir_latency;
1799 break;
1800 case CEPH_MDS_OP_SYMLINK:
1801 code = l_mdss_req_symlink_latency;
1802 break;
1803 case CEPH_MDS_OP_LSSNAP:
1804 code = l_mdss_req_lssnap_latency;
1805 break;
1806 case CEPH_MDS_OP_MKSNAP:
1807 code = l_mdss_req_mksnap_latency;
1808 break;
1809 case CEPH_MDS_OP_RMSNAP:
1810 code = l_mdss_req_rmsnap_latency;
1811 break;
1812 case CEPH_MDS_OP_RENAMESNAP:
1813 code = l_mdss_req_renamesnap_latency;
1814 break;
1815 default: ceph_abort();
1816 }
1817 logger->tinc(code, lat);
1818 }
1819
1820 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1821 {
1822 if (!g_conf()->mds_early_reply)
1823 return;
1824
1825 if (mdr->no_early_reply) {
1826 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1827 return;
1828 }
1829
1830 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
1831 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
1832 return;
1833 }
1834
1835 if (mdr->alloc_ino) {
1836 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
1837 return;
1838 }
1839
1840 const MClientRequest::const_ref &req = mdr->client_request;
1841 entity_inst_t client_inst = req->get_source_inst();
1842 if (client_inst.name.is_mds())
1843 return;
1844
1845 if (req->is_replay()) {
1846 dout(10) << " no early reply on replay op" << dendl;
1847 return;
1848 }
1849
1850
1851 auto reply = MClientReply::create(*req, 0);
1852 reply->set_unsafe();
1853
1854 // mark xlocks "done", indicating that we are exposing uncommitted changes.
1855 //
1856 // _rename_finish() does not send dentry link/unlink messages to replicas,
1857 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
1858 // that have projected linkages from getting new replicas.
1859 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
1860
1861 dout(10) << "early_reply " << reply->get_result()
1862 << " (" << cpp_strerror(reply->get_result())
1863 << ") " << *req << dendl;
1864
1865 if (tracei || tracedn) {
1866 if (tracei)
1867 mdr->cap_releases.erase(tracei->vino());
1868 if (tracedn)
1869 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1870
1871 set_trace_dist(mdr->session, reply, tracei, tracedn, mdr->snapid,
1872 req->get_dentry_wanted(), mdr);
1873 }
1874
1875 reply->set_extra_bl(mdr->reply_extra_bl);
1876 mds->send_message_client(reply, mdr->session);
1877
1878 mdr->did_early_reply = true;
1879
1880 mds->logger->inc(l_mds_reply);
1881 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
1882 mds->logger->tinc(l_mds_reply_latency, lat);
1883 if (client_inst.name.is_client()) {
1884 mds->sessionmap.hit_session(mdr->session);
1885 }
1886 perf_gather_op_latency(req, lat);
1887 dout(20) << "lat " << lat << dendl;
1888
1889 mdr->mark_event("early_replied");
1890 }
1891
1892 /*
1893 * send the given reply,
1894 * including a trace to tracei;
1895 * clean up the mdr
1896 */
1897 void Server::reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply)
1898 {
1899 ceph_assert(mdr.get());
1900 const MClientRequest::const_ref &req = mdr->client_request;
1901
1902 dout(7) << "reply_client_request " << reply->get_result()
1903 << " (" << cpp_strerror(reply->get_result())
1904 << ") " << *req << dendl;
1905
1906 mdr->mark_event("replying");
1907
1908 Session *session = mdr->session;
1909
1910 // note successful request in session map?
1911 //
1912 // setfilelock requests are special: they only modify state in MDS memory,
1913 // and that state is lost when the MDS fails. If a client re-sends a
1914 // completed setfilelock request, it means the client did not receive the
1915 // corresponding reply, so the MDS should re-execute the request.
1916 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
1917 reply->get_result() == 0 && session) {
1918 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
1919 session->add_completed_request(mdr->reqid.tid, created);
1920 if (mdr->ls) {
1921 mdr->ls->touched_sessions.insert(session->info.inst.name);
1922 }
1923 }
1924
1925 // give any preallocated inos to the session
1926 apply_allocated_inos(mdr, session);
1927
1928 // get tracei/tracedn from mdr?
1929 snapid_t snapid = mdr->snapid;
1930 CInode *tracei = mdr->tracei;
1931 CDentry *tracedn = mdr->tracedn;
1932
1933 bool is_replay = mdr->client_request->is_replay();
1934 bool did_early_reply = mdr->did_early_reply;
1935 entity_inst_t client_inst = req->get_source_inst();
1936 int dentry_wanted = req->get_dentry_wanted();
1937
1938 if (!did_early_reply && !is_replay) {
1939
1940 mds->logger->inc(l_mds_reply);
1941 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
1942 mds->logger->tinc(l_mds_reply_latency, lat);
1943 if (client_inst.name.is_client()) {
1944 mds->sessionmap.hit_session(session);
1945 }
1946 perf_gather_op_latency(req, lat);
1947 dout(20) << "lat " << lat << dendl;
1948
1949 if (tracei)
1950 mdr->cap_releases.erase(tracei->vino());
1951 if (tracedn)
1952 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
1953 }
1954
1955 // drop non-rdlocks before replying, so that we can issue leases
1956 mdcache->request_drop_non_rdlocks(mdr);
1957
1958 // reply at all?
1959 if (!client_inst.name.is_mds() && session) {
1960 // send reply.
1961 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
1962 (tracei || tracedn)) {
1963 if (is_replay) {
1964 if (tracei)
1965 mdcache->try_reconnect_cap(tracei, session);
1966 } else {
1967 // include metadata in reply
1968 set_trace_dist(session, reply, tracei, tracedn,
1969 snapid, dentry_wanted,
1970 mdr);
1971 }
1972 }
1973
1974 // We can set the extra bl unconditionally: if it's already been sent in the
1975 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
1976 reply->set_extra_bl(mdr->reply_extra_bl);
1977
1978 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
1979 mds->send_message_client(reply, session);
1980 }
1981
1982 if (req->is_queued_for_replay() &&
1983 (mdr->has_completed || reply->get_result() < 0)) {
1984 if (reply->get_result() < 0) {
1985 int r = reply->get_result();
1986 derr << "reply_client_request: failed to replay " << *req
1987 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
1988 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
1989 }
1990 mds->queue_one_replay();
1991 }
1992
1993 // clean up request
1994 mdcache->request_finish(mdr);
1995
1996 // take a closer look at tracei, if it happens to be a remote link
1997 if (tracei &&
1998 tracedn &&
1999 tracedn->get_projected_linkage()->is_remote()) {
2000 mdcache->eval_remote(tracedn);
2001 }
2002 }
2003
2004 /*
2005 * pass inode OR dentry (not both, or we may get confused)
2006 *
2007 * trace is in reverse order (i.e. root inode comes last)
2008 */
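// Encoding order of the trace bufferlist assembled below, in the
// dentry case (matching the encode calls in this function):
//   inodestat(diri), dirstat(dir), dentry name, lease, inodestat(in)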
2009 void Server::set_trace_dist(Session *session, const MClientReply::ref &reply,
2010 CInode *in, CDentry *dn,
2011 snapid_t snapid,
2012 int dentry_wanted,
2013 MDRequestRef& mdr)
2014 {
2015 // skip doing this for debugging purposes?
2016 if (g_conf()->mds_inject_traceless_reply_probability &&
2017 mdr->ls && !mdr->o_trunc &&
2018 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2019 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2020 return;
2021 }
2022
2023 // inode, dentry, dir, ..., inode
2024 bufferlist bl;
2025 mds_rank_t whoami = mds->get_nodeid();
2026 client_t client = session->get_client();
2027 utime_t now = ceph_clock_now();
2028
2029 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2030
2031 //assert((bool)dn == (bool)dentry_wanted); // not true for snapshot lookups
2032
2033 // realm
2034 if (snapid == CEPH_NOSNAP) {
2035 SnapRealm *realm;
2036 if (in)
2037 realm = in->find_snaprealm();
2038 else
2039 realm = dn->get_dir()->get_inode()->find_snaprealm();
2040 reply->snapbl = realm->get_snap_trace();
2041 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2042 }
2043
2044 // dir + dentry?
2045 if (dn) {
2046 reply->head.is_dentry = 1;
2047 CDir *dir = dn->get_dir();
2048 CInode *diri = dir->get_inode();
2049
2050 diri->encode_inodestat(bl, session, NULL, snapid);
2051 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2052
2053 #ifdef MDS_VERIFY_FRAGSTAT
2054 if (dir->is_complete())
2055 dir->verify_fragstat();
2056 #endif
2057 DirStat ds;
2058 ds.frag = dir->get_frag();
2059 ds.auth = dir->get_dir_auth().first;
2060 if (dir->is_auth())
2061 dir->get_dist_spec(ds.dist, whoami);
2062
2063 dir->encode_dirstat(bl, session->info, ds);
2064 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2065
2066 encode(dn->get_name(), bl);
2067 if (snapid == CEPH_NOSNAP)
2068 mds->locker->issue_client_lease(dn, client, bl, now, session);
2069 else {
2070 //null lease
2071 LeaseStat e;
2072 mds->locker->encode_lease(bl, session->info, e);
2073 }
2074 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2075 } else
2076 reply->head.is_dentry = 0;
2077
2078 // inode
2079 if (in) {
2080 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2081 dout(20) << "set_trace_dist added in " << *in << dendl;
2082 reply->head.is_target = 1;
2083 } else
2084 reply->head.is_target = 0;
2085
2086 reply->set_trace(bl);
2087 }
2088
2089 void Server::handle_client_request(const MClientRequest::const_ref &req)
2090 {
2091 dout(4) << "handle_client_request " << *req << dendl;
2092
2093 if (mds->logger)
2094 mds->logger->inc(l_mds_request);
2095 if (logger)
2096 logger->inc(l_mdss_handle_client_request);
2097
2098 if (!mdcache->is_open()) {
2099 dout(5) << "waiting for root" << dendl;
2100 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2101 return;
2102 }
2103
2104 // active session?
2105 Session *session = 0;
2106 if (req->get_source().is_client()) {
2107 session = mds->get_session(req);
2108 if (!session) {
2109 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2110 } else if (session->is_closed() ||
2111 session->is_closing() ||
2112 session->is_killing()) {
2113 dout(5) << "session closed|closing|killing, dropping" << dendl;
2114 session = NULL;
2115 }
2116 if (!session) {
2117 if (req->is_queued_for_replay())
2118 mds->queue_one_replay();
2119 return;
2120 }
2121 }
2122
2123 // old mdsmap?
2124 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2125 // send it? hrm, this isn't ideal; they may get a lot of copies if
2126 // they have a high request rate.
2127 }
2128
2129 // completed request?
2130 bool has_completed = false;
2131 if (req->is_replay() || req->get_retry_attempt()) {
2132 ceph_assert(session);
2133 inodeno_t created;
2134 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2135 has_completed = true;
2136 // Don't send a traceless reply if the completed request created a
2137 // new inode; treat the request as a lookup request instead.
2138 if (req->is_replay() ||
2139 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2140 req->get_op() != CEPH_MDS_OP_OPEN &&
2141 req->get_op() != CEPH_MDS_OP_CREATE)) {
2142 dout(5) << "already completed " << req->get_reqid() << dendl;
2143 auto reply = MClientReply::create(*req, 0);
2144 if (created != inodeno_t()) {
2145 bufferlist extra;
2146 encode(created, extra);
2147 reply->set_extra_bl(extra);
2148 }
2149 mds->send_message_client(reply, session);
2150
2151 if (req->is_queued_for_replay())
2152 mds->queue_one_replay();
2153
2154 return;
2155 }
2156 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2157 req->get_op() != CEPH_MDS_OP_CREATE) {
2158 dout(10) << " completed request which created new inode " << created
2159 << ", convert it to lookup request" << dendl;
2160 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2161 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2162 }
2163 }
2164 }
2165
2166 // trim completed_request list
2167 if (req->get_oldest_client_tid() > 0) {
2168 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2169 ceph_assert(session);
2170 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2171 // The session's 'completed_requests' was dirtied; mark it to be
2172 // potentially flushed at segment expiry.
2173 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2174
2175 if (session->get_num_trim_requests_warnings() > 0 &&
2176 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2177 session->reset_num_trim_requests_warnings();
2178 } else {
2179 if (session->get_num_completed_requests() >=
2180 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2181 session->inc_num_trim_requests_warnings();
2182 stringstream ss;
2183 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2184 << req->get_oldest_client_tid() << "), "
2185 << session->get_num_completed_requests()
2186 << " completed requests recorded in session\n";
2187 mds->clog->warn() << ss.str();
2188 dout(20) << __func__ << " " << ss.str() << dendl;
2189 }
2190 }
2191 }
2192
2193 // register + dispatch
2194 MDRequestRef mdr = mdcache->request_start(req);
2195 if (!mdr.get())
2196 return;
2197
2198 if (session) {
2199 mdr->session = session;
2200 session->requests.push_back(&mdr->item_session_request);
2201 }
2202
2203 if (has_completed)
2204 mdr->has_completed = true;
2205
2206 // process embedded cap releases?
2207 // (only if NOT replay!)
2208 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2209 client_t client = req->get_source().num();
2210 for (const auto &r : req->releases) {
2211 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2212 }
2213 req->releases.clear();
2214 }
2215
2216 dispatch_client_request(mdr);
2217 return;
2218 }
2219
2220 void Server::handle_osd_map()
2221 {
2222 /* Note that we check the OSDMAP_FULL flag directly rather than
2223 * using osdmap_full_flag(), because we want to know "is the flag set"
2224 * rather than "does the flag apply to us?" */
2225 mds->objecter->with_osdmap([this](const OSDMap& o) {
2226 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2227 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2228 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2229 << o.get_epoch() << dendl;
2230 });
2231 }
2232
2233 void Server::dispatch_client_request(MDRequestRef& mdr)
2234 {
2235 // we shouldn't be waiting on anyone.
2236 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
2237
2238 if (mdr->killed) {
2239 dout(10) << "request " << *mdr << " was killed" << dendl;
2240 return;
2241 } else if (mdr->aborted) {
2242 mdr->aborted = false;
2243 mdcache->request_kill(mdr);
2244 return;
2245 }
2246
2247 const MClientRequest::const_ref &req = mdr->client_request;
2248
2249 if (logger) logger->inc(l_mdss_dispatch_client_request);
2250
2251 dout(7) << "dispatch_client_request " << *req << dendl;
2252
2253 if (req->may_write()) {
2254 if (mdcache->is_readonly()) {
2255 dout(10) << " read-only FS" << dendl;
2256 respond_to_request(mdr, -EROFS);
2257 return;
2258 }
2259 if (mdr->has_more() && mdr->more()->slave_error) {
2260 dout(10) << " got error from slaves" << dendl;
2261 respond_to_request(mdr, mdr->more()->slave_error);
2262 return;
2263 }
2264 }
2265
2266 if (is_full) {
2267 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2268 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2270 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2271 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2272 req->get_op() == CEPH_MDS_OP_CREATE ||
2273 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2274 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2275 ((req->get_op() == CEPH_MDS_OP_LINK ||
2276 req->get_op() == CEPH_MDS_OP_RENAME) &&
2277 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
2278 ) {
2279
2280 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2281 respond_to_request(mdr, -ENOSPC);
2282 return;
2283 } else {
2284 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2285 }
2286 }
2287
2288 switch (req->get_op()) {
2289 case CEPH_MDS_OP_LOOKUPHASH:
2290 case CEPH_MDS_OP_LOOKUPINO:
2291 handle_client_lookup_ino(mdr, false, false);
2292 break;
2293 case CEPH_MDS_OP_LOOKUPPARENT:
2294 handle_client_lookup_ino(mdr, true, false);
2295 break;
2296 case CEPH_MDS_OP_LOOKUPNAME:
2297 handle_client_lookup_ino(mdr, false, true);
2298 break;
2299
2300 // inodes ops.
2301 case CEPH_MDS_OP_LOOKUP:
2302 handle_client_getattr(mdr, true);
2303 break;
2304
2305 case CEPH_MDS_OP_LOOKUPSNAP:
2306 // lookupsnap does not reference a CDentry; treat it as a getattr
2307 case CEPH_MDS_OP_GETATTR:
2308 handle_client_getattr(mdr, false);
2309 break;
2310
2311 case CEPH_MDS_OP_SETATTR:
2312 handle_client_setattr(mdr);
2313 break;
2314 case CEPH_MDS_OP_SETLAYOUT:
2315 handle_client_setlayout(mdr);
2316 break;
2317 case CEPH_MDS_OP_SETDIRLAYOUT:
2318 handle_client_setdirlayout(mdr);
2319 break;
2320 case CEPH_MDS_OP_SETXATTR:
2321 handle_client_setxattr(mdr);
2322 break;
2323 case CEPH_MDS_OP_RMXATTR:
2324 handle_client_removexattr(mdr);
2325 break;
2326
2327 case CEPH_MDS_OP_READDIR:
2328 handle_client_readdir(mdr);
2329 break;
2330
2331 case CEPH_MDS_OP_SETFILELOCK:
2332 handle_client_file_setlock(mdr);
2333 break;
2334
2335 case CEPH_MDS_OP_GETFILELOCK:
2336 handle_client_file_readlock(mdr);
2337 break;
2338
2339 // funky.
2340 case CEPH_MDS_OP_CREATE:
2341 if (mdr->has_completed)
2342 handle_client_open(mdr); // already created.. just open
2343 else
2344 handle_client_openc(mdr);
2345 break;
2346
2347 case CEPH_MDS_OP_OPEN:
2348 handle_client_open(mdr);
2349 break;
2350
2351 // namespace.
2352 // no prior locks.
2353 case CEPH_MDS_OP_MKNOD:
2354 handle_client_mknod(mdr);
2355 break;
2356 case CEPH_MDS_OP_LINK:
2357 handle_client_link(mdr);
2358 break;
2359 case CEPH_MDS_OP_UNLINK:
2360 case CEPH_MDS_OP_RMDIR:
2361 handle_client_unlink(mdr);
2362 break;
2363 case CEPH_MDS_OP_RENAME:
2364 handle_client_rename(mdr);
2365 break;
2366 case CEPH_MDS_OP_MKDIR:
2367 handle_client_mkdir(mdr);
2368 break;
2369 case CEPH_MDS_OP_SYMLINK:
2370 handle_client_symlink(mdr);
2371 break;
2372
2373
2374 // snaps
2375 case CEPH_MDS_OP_LSSNAP:
2376 handle_client_lssnap(mdr);
2377 break;
2378 case CEPH_MDS_OP_MKSNAP:
2379 handle_client_mksnap(mdr);
2380 break;
2381 case CEPH_MDS_OP_RMSNAP:
2382 handle_client_rmsnap(mdr);
2383 break;
2384 case CEPH_MDS_OP_RENAMESNAP:
2385 handle_client_renamesnap(mdr);
2386 break;
2387
2388 default:
2389 dout(1) << " unknown client op " << req->get_op() << dendl;
2390 respond_to_request(mdr, -EOPNOTSUPP);
2391 }
2392 }
2393
2394
2395 // ---------------------------------------
2396 // SLAVE REQUESTS
2397
2398 void Server::handle_slave_request(const MMDSSlaveRequest::const_ref &m)
2399 {
2400 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2401 mds_rank_t from = mds_rank_t(m->get_source().num());
2402
2403 if (logger) logger->inc(l_mdss_handle_slave_request);
2404
2405 // reply?
2406 if (m->is_reply())
2407 return handle_slave_request_reply(m);
2408
2409 // The purpose of rename notify is to enforce causal message ordering: it makes
2410 // sure bystanders have received all messages from the rename srcdn's auth MDS.
2411 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
2412 auto reply = MMDSSlaveRequest::create(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
2413 mds->send_message(reply, m->get_connection());
2414 return;
2415 }
2416
2417 CDentry *straydn = NULL;
2418 if (m->straybl.length() > 0) {
2419 straydn = mdcache->add_replica_stray(m->straybl, from);
2420 ceph_assert(straydn);
2421 m->straybl.clear();
2422 }
2423
2424 // am i a new slave?
2425 MDRequestRef mdr;
2426 if (mdcache->have_request(m->get_reqid())) {
2427 // existing?
2428 mdr = mdcache->request_get(m->get_reqid());
2429
2430 // is my request newer?
2431 if (mdr->attempt > m->get_attempt()) {
2432 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2433 << ", dropping " << *m << dendl;
2434 return;
2435 }
2436
2437
2438 if (mdr->attempt < m->get_attempt()) {
2439 // mine is old, close it out
2440 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2441 << ", closing out" << dendl;
2442 mdcache->request_finish(mdr);
2443 mdr.reset();
2444 } else if (mdr->slave_to_mds != from) {
2445 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
2446 return;
2447 }
2448
2449 if (m->get_op() == MMDSSlaveRequest::OP_FINISH && m->is_abort()) {
2450 mdr->aborted = true;
2451 if (mdr->slave_request) {
2452 // only abort on-going xlock, wrlock and auth pin
2453 ceph_assert(!mdr->slave_did_prepare());
2454 } else {
2455 mdcache->request_finish(mdr);
2456 }
2457 return;
2458 }
2459 }
2460 if (!mdr.get()) {
2461 // new?
2462 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2463 dout(10) << "missing slave request for " << m->get_reqid()
2464 << " OP_FINISH, must have lost race with a forward" << dendl;
2465 return;
2466 }
2467 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2468 mdr->set_op_stamp(m->op_stamp);
2469 }
2470 ceph_assert(mdr->slave_request == 0); // only one at a time, please!
2471
2472 if (straydn) {
2473 mdr->pin(straydn);
2474 mdr->straydn = straydn;
2475 }
2476
2477 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2478 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2479 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2480 return;
2481 } else if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2482 mdr->locks.empty()) {
2483 dout(3) << "not active yet, waiting" << dendl;
2484 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2485 return;
2486 }
2487
2488 mdr->reset_slave_request(m);
2489
2490 dispatch_slave_request(mdr);
2491 }
2492
2493 void Server::handle_slave_request_reply(const MMDSSlaveRequest::const_ref &m)
2494 {
2495 mds_rank_t from = mds_rank_t(m->get_source().num());
2496
2497 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2498 metareqid_t r = m->get_reqid();
2499 if (!mdcache->have_uncommitted_master(r, from)) {
2500 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2501 << from << " reqid " << r << dendl;
2502 return;
2503 }
2504 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2505 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2506 return;
2507 }
2508
2509 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2510 metareqid_t r = m->get_reqid();
2511 mdcache->committed_master_slave(r, from);
2512 return;
2513 }
2514
2515 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2516 if (m->get_attempt() != mdr->attempt) {
2517 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2518 << m->get_attempt() << dendl;
2519 return;
2520 }
2521
2522 switch (m->get_op()) {
2523 case MMDSSlaveRequest::OP_XLOCKACK:
2524 {
2525 // identify lock, master request
2526 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2527 m->get_object_info());
2528 mdr->more()->slaves.insert(from);
2529 lock->decode_locked_state(m->get_lock_data());
2530 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2531 mdr->locks.emplace_hint(mdr->locks.end(), lock, MutationImpl::LockOp::XLOCK);
2532 mdr->finish_locking(lock);
2533 lock->get_xlock(mdr, mdr->get_client());
2534
2535 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2536 mdr->more()->waiting_on_slave.erase(from);
2537 ceph_assert(mdr->more()->waiting_on_slave.empty());
2538 mdcache->dispatch_request(mdr);
2539 }
2540 break;
2541
2542 case MMDSSlaveRequest::OP_WRLOCKACK:
2543 {
2544 // identify lock, master request
2545 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2546 m->get_object_info());
2547 mdr->more()->slaves.insert(from);
2548 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2549 auto it = mdr->locks.emplace_hint(mdr->locks.end(),
2550 lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2551 ceph_assert(it->is_remote_wrlock());
2552 ceph_assert(it->wrlock_target == from);
2553
2554 mdr->finish_locking(lock);
2555
2556 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2557 mdr->more()->waiting_on_slave.erase(from);
2558 ceph_assert(mdr->more()->waiting_on_slave.empty());
2559 mdcache->dispatch_request(mdr);
2560 }
2561 break;
2562
2563 case MMDSSlaveRequest::OP_AUTHPINACK:
2564 handle_slave_auth_pin_ack(mdr, m);
2565 break;
2566
2567 case MMDSSlaveRequest::OP_LINKPREPACK:
2568 handle_slave_link_prep_ack(mdr, m);
2569 break;
2570
2571 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2572 handle_slave_rmdir_prep_ack(mdr, m);
2573 break;
2574
2575 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2576 handle_slave_rename_prep_ack(mdr, m);
2577 break;
2578
2579 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2580 handle_slave_rename_notify_ack(mdr, m);
2581 break;
2582
2583 default:
2584 ceph_abort();
2585 }
2586 }
2587
2588 void Server::dispatch_slave_request(MDRequestRef& mdr)
2589 {
2590 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2591
2592 if (mdr->aborted) {
2593 dout(7) << " abort flag set, finishing" << dendl;
2594 mdcache->request_finish(mdr);
2595 return;
2596 }
2597
2598 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2599
2600 int op = mdr->slave_request->get_op();
2601 switch (op) {
2602 case MMDSSlaveRequest::OP_XLOCK:
2603 case MMDSSlaveRequest::OP_WRLOCK:
2604 {
2605 // identify object
2606 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2607 mdr->slave_request->get_object_info());
2608
2609 if (!lock) {
2610 dout(10) << "don't have object, dropping" << dendl;
2611 ceph_abort(); // can this happen if we auth pinned properly?
2612 }
2613 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2614 dout(10) << "not auth for remote xlock attempt, dropping on "
2615 << *lock << " on " << *lock->get_parent() << dendl;
2616 } else {
2617 // use acquire_locks so that we get auth_pinning.
2618 MutationImpl::LockOpVec lov;
2619 for (const auto& p : mdr->locks) {
2620 if (p.is_xlock())
2621 lov.add_xlock(p.lock);
2622 else if (p.is_wrlock())
2623 lov.add_wrlock(p.lock);
2624 }
2625
2626 int replycode = 0;
2627 switch (op) {
2628 case MMDSSlaveRequest::OP_XLOCK:
2629 lov.add_xlock(lock);
2630 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2631 break;
2632 case MMDSSlaveRequest::OP_WRLOCK:
2633 lov.add_wrlock(lock);
2634 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2635 break;
2636 }
2637
2638 if (!mds->locker->acquire_locks(mdr, lov))
2639 return;
2640
2641 // ack
2642 auto r = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, replycode);
2643 r->set_lock_type(lock->get_type());
2644 lock->get_parent()->set_object_info(r->get_object_info());
2645 if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
2646 lock->encode_locked_state(r->get_lock_data());
2647 mds->send_message(r, mdr->slave_request->get_connection());
2648 }
2649
2650 // done.
2651 mdr->reset_slave_request();
2652 }
2653 break;
2654
2655 case MMDSSlaveRequest::OP_UNXLOCK:
2656 case MMDSSlaveRequest::OP_UNWRLOCK:
2657 {
2658 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2659 mdr->slave_request->get_object_info());
2660 ceph_assert(lock);
2661 auto it = mdr->locks.find(lock);
2662 ceph_assert(it != mdr->locks.end());
2663 bool need_issue = false;
2664 switch (op) {
2665 case MMDSSlaveRequest::OP_UNXLOCK:
2666 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2667 break;
2668 case MMDSSlaveRequest::OP_UNWRLOCK:
2669 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2670 break;
2671 }
2672 if (need_issue)
2673 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2674
2675 // done. no ack necessary.
2676 mdr->reset_slave_request();
2677 }
2678 break;
2679
2680 case MMDSSlaveRequest::OP_DROPLOCKS:
2681 mds->locker->drop_locks(mdr.get());
2682 mdr->reset_slave_request();
2683 break;
2684
2685 case MMDSSlaveRequest::OP_AUTHPIN:
2686 handle_slave_auth_pin(mdr);
2687 break;
2688
2689 case MMDSSlaveRequest::OP_LINKPREP:
2690 case MMDSSlaveRequest::OP_UNLINKPREP:
2691 handle_slave_link_prep(mdr);
2692 break;
2693
2694 case MMDSSlaveRequest::OP_RMDIRPREP:
2695 handle_slave_rmdir_prep(mdr);
2696 break;
2697
2698 case MMDSSlaveRequest::OP_RENAMEPREP:
2699 handle_slave_rename_prep(mdr);
2700 break;
2701
2702 case MMDSSlaveRequest::OP_FINISH:
2703 // information about rename imported caps
2704 if (mdr->slave_request->inode_export.length() > 0)
2705 mdr->more()->inode_import = mdr->slave_request->inode_export;
2706 // finish off request.
2707 mdcache->request_finish(mdr);
2708 break;
2709
2710 default:
2711 ceph_abort();
2712 }
2713 }
2714
2715 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2716 {
2717 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2718
2719 // build list of objects
2720 list<MDSCacheObject*> objects;
2721 CInode *auth_pin_freeze = NULL;
2722 bool fail = false, wouldblock = false, readonly = false;
2723
2724 if (mdcache->is_readonly()) {
2725 dout(10) << " read-only FS" << dendl;
2726 readonly = true;
2727 fail = true;
2728 }
2729
2730 if (!fail) {
2731 for (const auto &oi : mdr->slave_request->get_authpins()) {
2732 MDSCacheObject *object = mdcache->get_object(oi);
2733 if (!object) {
2734 dout(10) << " don't have " << oi << dendl;
2735 fail = true;
2736 break;
2737 }
2738
2739 objects.push_back(object);
2740 if (oi == mdr->slave_request->get_authpin_freeze())
2741 auth_pin_freeze = static_cast<CInode*>(object);
2742 }
2743 }
2744
2745 // can we auth pin them?
2746 if (!fail) {
2747 for (list<MDSCacheObject*>::iterator p = objects.begin();
2748 p != objects.end();
2749 ++p) {
2750 if (!(*p)->is_auth()) {
2751 dout(10) << " not auth for " << **p << dendl;
2752 fail = true;
2753 break;
2754 }
2755 if (mdr->is_auth_pinned(*p))
2756 continue;
2757 if (!mdr->can_auth_pin(*p)) {
2758 if (mdr->slave_request->is_nonblock()) {
2759 dout(10) << " can't auth_pin (freezing?) " << **p << " nonblocking" << dendl;
2760 fail = true;
2761 wouldblock = true;
2762 break;
2763 }
2764 // wait
2765 dout(10) << " waiting for authpinnable on " << **p << dendl;
2766 (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2767 mdr->drop_local_auth_pins();
2768
2769 mds->locker->notify_freeze_waiter(*p);
2770 return;
2771 }
2772 }
2773 }
2774
2775 // auth pin!
2776 if (fail) {
2777 mdr->drop_local_auth_pins(); // just in case
2778 } else {
2779 /* a frozen auth pin exists on the wrong inode; unfreeze it */
2780 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2781 mdr->more()->rename_inode != auth_pin_freeze)
2782 mdr->unfreeze_auth_pin(true);
2783
2784 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2785 * on the source inode to complete. This happens after all locks for the rename
2786 * operation are acquired. But to acquire locks, we need to auth pin the locks'
2787 * parent objects first. So there is an ABBA deadlock if someone auth pins the
2788 * source inode after locks are acquired and before Server::handle_slave_rename_prep()
2789 * is called. The solution is to freeze the inode and prevent other MDRequests
2790 * from getting new auth pins.
2791 */
2792 if (auth_pin_freeze) {
2793 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2794 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2795 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2796 mds->mdlog->flush();
2797 return;
2798 }
2799 }
2800 for (list<MDSCacheObject*>::iterator p = objects.begin();
2801 p != objects.end();
2802 ++p) {
2803 dout(10) << "auth_pinning " << **p << dendl;
2804 mdr->auth_pin(*p);
2805 }
2806 }
2807
2808 // ack!
2809 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
2810
2811 // return list of my auth_pins (if any)
2812 for (const auto &p : mdr->auth_pins) {
2813 MDSCacheObjectInfo info;
2814 p->set_object_info(info);
2815 reply->get_authpins().push_back(info);
2816 if (p == (MDSCacheObject*)auth_pin_freeze)
2817 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
2818 }
2819
2820 if (wouldblock)
2821 reply->mark_error_wouldblock();
2822 if (readonly)
2823 reply->mark_error_rofs();
2824
2825 mds->send_message_mds(reply, mdr->slave_to_mds);
2826
2827 // clean up this request
2828 mdr->reset_slave_request();
2829 return;
2830 }
2831
2832 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
2833 {
2834 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
2835 mds_rank_t from = mds_rank_t(ack->get_source().num());
2836
2837 // added auth pins?
2838 set<MDSCacheObject*> pinned;
2839 for (const auto &oi : ack->get_authpins()) {
2840 MDSCacheObject *object = mdcache->get_object(oi);
2841 ceph_assert(object); // we pinned it
2842 dout(10) << " remote has pinned " << *object << dendl;
2843 if (!mdr->is_auth_pinned(object))
2844 mdr->remote_auth_pins[object] = from;
2845 if (oi == ack->get_authpin_freeze())
2846 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
2847 pinned.insert(object);
2848 }
2849
2850 // removed frozen auth pin ?
2851 if (mdr->more()->is_remote_frozen_authpin &&
2852 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
2853 auto p = mdr->remote_auth_pins.find(mdr->more()->rename_inode);
2854 ceph_assert(p != mdr->remote_auth_pins.end());
2855 if (p->second == from) {
2856 mdr->more()->is_remote_frozen_authpin = false;
2857 }
2858 }
2859
2860 // removed auth pins?
2861 auto p = mdr->remote_auth_pins.begin();
2862 while (p != mdr->remote_auth_pins.end()) {
2863 MDSCacheObject* object = p->first;
2864 if (p->second == from && pinned.count(object) == 0) {
2865 dout(10) << " remote has unpinned " << *object << dendl;
2866 mdr->remote_auth_pins.erase(p++);
2867 } else {
2868 ++p;
2869 }
2870 }
2871
2872 if (ack->is_error_rofs()) {
2873 mdr->more()->slave_error = -EROFS;
2874 mdr->aborted = true;
2875 } else if (ack->is_error_wouldblock()) {
2876 mdr->more()->slave_error = -EWOULDBLOCK;
2877 mdr->aborted = true;
2878 }
2879
2880 // note slave
2881 mdr->more()->slaves.insert(from);
2882
2883 // clear from waiting list
2884 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2885 mdr->more()->waiting_on_slave.erase(from);
2886
2887 // go again?
2888 if (mdr->more()->waiting_on_slave.empty())
2889 mdcache->dispatch_request(mdr);
2890 else
2891 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
2892 }
2893
2894
2895 // ---------------------------------------
2896 // HELPERS
2897
2898
2899 /**
2900 * check whether we are permitted to complete a request
2901 *
2902 * Check whether we have permission to perform the operation specified
2903 * by mask on the given inode, based on the capability in the mdr's
2904 * session.
2905 */
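// Illustrative call pattern from the op handlers (MAY_WRITE is one of
// the MAY_* permission masks; `cur` stands in for whatever inode the
// handler is operating on):
//
//   if (!check_access(mdr, cur, MAY_WRITE))
//     return;  // check_access() has already replied with the error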
2906 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
2907 {
2908 if (mdr->session) {
2909 int r = mdr->session->check_access(
2910 in, mask,
2911 mdr->client_request->get_caller_uid(),
2912 mdr->client_request->get_caller_gid(),
2913 &mdr->client_request->get_caller_gid_list(),
2914 mdr->client_request->head.args.setattr.uid,
2915 mdr->client_request->head.args.setattr.gid);
2916 if (r < 0) {
2917 respond_to_request(mdr, r);
2918 return false;
2919 }
2920 }
2921 return true;
2922 }
2923
2924 /**
2925 * check whether fragment has reached maximum size
2926 *
2927 */
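// On failure this helper has already replied with -ENOSPC, so callers
// simply bail out, e.g. (illustrative):
//
//   if (!check_fragment_space(mdr, dir))
//     return;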
2928 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
2929 {
2930 const auto size = in->get_frag_size();
2931 if (size >= g_conf()->mds_bal_fragment_size_max) {
2932 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
2933 respond_to_request(mdr, -ENOSPC);
2934 return false;
2935 }
2936
2937 return true;
2938 }
2939
2940
2941 /** validate_dentry_dir
2942 *
2943 * verify that the dir exists and would own the dname.
2944 * do not check if the dentry exists.
2945 */
2946 CDir *Server::validate_dentry_dir(MDRequestRef& mdr, CInode *diri, std::string_view dname)
2947 {
2948 // make sure parent is a dir?
2949 if (!diri->is_dir()) {
2950 dout(7) << "validate_dentry_dir: not a dir" << dendl;
2951 respond_to_request(mdr, -ENOTDIR);
2952 return NULL;
2953 }
2954
2955 // which dirfrag?
2956 frag_t fg = diri->pick_dirfrag(dname);
2957 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
2958 if (!dir)
2959 return 0;
2960
2961 // frozen?
2962 if (dir->is_frozen()) {
2963 dout(7) << "dir is frozen " << *dir << dendl;
2964 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2965 return NULL;
2966 }
2967
2968 return dir;
2969 }
2970
2971
2972 /** prepare_null_dentry
2973 * prepare a null (or existing) dentry in given dir.
2974 * wait for any dn lock.
2975 */
2976 CDentry* Server::prepare_null_dentry(MDRequestRef& mdr, CDir *dir, std::string_view dname, bool okexist)
2977 {
2978 dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
2979 ceph_assert(dir->is_auth());
2980
2981 client_t client = mdr->get_client();
2982
2983 // does it already exist?
2984 CDentry *dn = dir->lookup(dname);
2985 if (dn) {
2986 /*
2987 if (dn->lock.is_xlocked_by_other(mdr)) {
2988 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
2989 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
2990 return 0;
2991 }
2992 */
2993 if (!dn->get_linkage(client, mdr)->is_null()) {
2994 // name already exists
2995 dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
2996 if (!okexist) {
2997 respond_to_request(mdr, -EEXIST);
2998 return 0;
2999 }
3000 } else {
3001 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3002 dn->first = std::max(dn->first, next_snap);
3003 }
3004 return dn;
3005 }
3006
3007 // make sure dir is complete
3008 if (!dir->is_complete() && (!dir->has_bloom() || dir->is_in_bloom(dname))) {
3009 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
3010 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
3011 return 0;
3012 }
3013
3014 // create
3015 dn = dir->add_null_dentry(dname, mdcache->get_global_snaprealm()->get_newest_seq() + 1);
3016 dn->mark_new();
3017 dout(10) << "prepare_null_dentry added " << *dn << dendl;
3018 return dn;
3019 }
3020
3021 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3022 {
3023 CDentry *straydn = mdr->straydn;
3024 if (straydn) {
3025 string straydname;
3026 in->name_stray_dentry(straydname);
3027 if (straydn->get_name() == straydname)
3028 return straydn;
3029
3030 ceph_assert(!mdr->done_locking);
3031 mdr->unpin(straydn);
3032 }
3033
3034 CDir *straydir = mdcache->get_stray_dir(in);
3035
3036 if (!mdr->client_request->is_replay() &&
3037 !check_fragment_space(mdr, straydir))
3038 return NULL;
3039
3040 straydn = mdcache->get_or_create_stray_dentry(in);
3041 mdr->straydn = straydn;
3042 mdr->pin(straydn);
3043 return straydn;
3044 }
3045
3046 /** prepare_new_inode
3047 *
3048 * create a new inode. set c/m/atime. hit dir pop.
3049 */
3050 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3051 file_layout_t *layout)
3052 {
3053 CInode *in = new CInode(mdcache);
3054
3055 // Server::prepare_force_open_sessions() can re-open a session in the closing
3056 // state. In that corner case, the session's prealloc_inos are being freed.
3057 // To simplify the code, we disallow using/refilling the session's prealloc_inos
3058 // while the session is opening.
3059 bool allow_prealloc_inos = !mdr->session->is_opening();
3060
3061 // assign ino
3062 if (allow_prealloc_inos &&
3063 mdr->session->info.prealloc_inos.size()) {
3064 mdr->used_prealloc_ino =
3065 in->inode.ino = mdr->session->take_ino(useino); // prealloc -> used
3066 mds->sessionmap.mark_projected(mdr->session);
3067
3068 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3069 << " (" << mdr->session->info.prealloc_inos
3070 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
3071 << dendl;
3072 } else {
3073 mdr->alloc_ino =
3074 in->inode.ino = mds->inotable->project_alloc_id();
3075 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3076 }
3077
3078 if (useino && useino != in->inode.ino) {
3079 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
3080 mds->clog->error() << mdr->client_request->get_source()
3081 << " specified ino " << useino
3082 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
3083 //ceph_abort(); // just for now.
3084 }
3085
3086 if (allow_prealloc_inos &&
3087 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3088 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3089 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3090 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3091 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3092 mds->sessionmap.mark_projected(mdr->session);
3093 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3094 }
3095
3096 in->inode.version = 1;
3097 in->inode.xattr_version = 1;
3098 in->inode.nlink = 1; // FIXME
3099
3100 in->inode.mode = mode;
3101
3102 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
3103 if (in->inode.is_dir()) {
3104 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3105 } else if (layout) {
3106 in->inode.layout = *layout;
3107 } else {
3108 in->inode.layout = mdcache->default_file_layout;
3109 }
3110
3111 in->inode.truncate_size = -1ull; // not truncated, yet!
3112 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3113
3114 CInode *diri = dir->get_inode();
3115
3116 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
3117
3118 if (diri->inode.mode & S_ISGID) {
3119 dout(10) << " dir is sticky" << dendl;
3120 in->inode.gid = diri->inode.gid;
3121 if (S_ISDIR(mode)) {
3122 dout(10) << " new dir also sticky" << dendl;
3123 in->inode.mode |= S_ISGID;
3124 }
3125 } else
3126 in->inode.gid = mdr->client_request->get_caller_gid();
3127
3128 in->inode.uid = mdr->client_request->get_caller_uid();
3129
3130 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
3131 mdr->get_op_stamp();
3132
3133 in->inode.change_attr = 0;
3134
3135 const MClientRequest::const_ref &req = mdr->client_request;
3136 if (req->get_data().length()) {
3137 auto p = req->get_data().cbegin();
3138
3139 // xattrs on new inode?
3140 CInode::mempool_xattr_map xattrs;
3141 decode(xattrs, p);
3142 for (const auto &p : xattrs) {
3143 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
3144 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
3145 if (!em.second)
3146 em.first->second = p.second;
3147 }
3148 }
3149
3150 if (!mds->mdsmap->get_inline_data_enabled() ||
3151 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3152 in->inode.inline_data.version = CEPH_INLINE_NONE;
3153
3154 mdcache->add_inode(in); // add
3155 dout(10) << "prepare_new_inode " << *in << dendl;
3156 return in;
3157 }
3158
3159 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3160 {
3161 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3162 << " inotablev " << mds->inotable->get_projected_version()
3163 << dendl;
3164 blob->set_ino_alloc(mdr->alloc_ino,
3165 mdr->used_prealloc_ino,
3166 mdr->prealloc_inos,
3167 mdr->client_request->get_source(),
3168 mds->sessionmap.get_projected(),
3169 mds->inotable->get_projected_version());
3170 }
3171
3172 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3173 {
3174 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3175 << " / " << mdr->prealloc_inos
3176 << " / " << mdr->used_prealloc_ino << dendl;
3177
3178 if (mdr->alloc_ino) {
3179 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3180 }
3181 if (mdr->prealloc_inos.size()) {
3182 ceph_assert(session);
3183 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3184 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3185 mds->sessionmap.mark_dirty(session);
3186 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3187 }
3188 if (mdr->used_prealloc_ino) {
3189 ceph_assert(session);
3190 session->info.used_inos.erase(mdr->used_prealloc_ino);
3191 mds->sessionmap.mark_dirty(session);
3192 }
3193 }
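// Summary of the three ino-allocation fields handled above (as
// journaled by journal_allocated_inos()):
//  - alloc_ino:         a single ino allocated directly from the InoTable
//  - used_prealloc_ino: an ino consumed from the session's prealloc pool
//  - prealloc_inos:     a fresh batch reserved to refill that pool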
3194
3195 class C_MDS_TryFindInode : public ServerContext {
3196 MDRequestRef mdr;
3197 public:
3198 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3199 void finish(int r) override {
3200 if (r == -ESTALE) // :( find_ino_peers failed
3201 server->respond_to_request(mdr, r);
3202 else
3203 server->dispatch_client_request(mdr);
3204 }
3205 };
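// Used on the -ESTALE recovery paths below, e.g.:
//   mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));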
3206
3207 class CF_MDS_MDRContextFactory : public MDSContextFactory {
3208 public:
3209 CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr) : cache(cache), mdr(mdr) {}
3210 MDSContext *build() {
3211 return new C_MDS_RetryRequest(cache, mdr);
3212 }
3213 private:
3214 MDCache *cache;
3215 MDRequestRef mdr;
3216 };
3217
3218 CDir *Server::traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath)
3219 {
3220 // figure parent dir vs dname
3221 if (refpath.depth() == 0) {
3222 dout(7) << "can't do that to root" << dendl;
3223 respond_to_request(mdr, -EINVAL);
3224 return 0;
3225 }
3226 string dname = refpath.last_dentry();
3227 refpath.pop_dentry();
3228
3229 dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
3230
3231 // traverse to parent dir
3232 CInode *diri;
3233 CF_MDS_MDRContextFactory cf(mdcache, mdr);
3234 int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &diri, MDS_TRAVERSE_FORWARD);
3235 if (r > 0) return 0; // delayed
3236 if (r < 0) {
3237 if (r == -ESTALE) {
3238 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3239 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3240 return 0;
3241 }
3242 respond_to_request(mdr, r);
3243 return 0;
3244 }
3245
3246 // is it an auth dir?
3247 CDir *dir = validate_dentry_dir(mdr, diri, dname);
3248 if (!dir)
3249 return 0; // forwarded or waiting for freeze
3250
3251 dout(10) << "traverse_to_auth_dir " << *dir << dendl;
3252 return dir;
3253 }
3254
3255 /* If this returns null, the request has been handled
3256 * as appropriate: forwarded on, or the client's been replied to */
3257 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr, int n,
3258 MutationImpl::LockOpVec& lov,
3259 bool want_auth,
3260 bool no_want_auth, /* for readdir, who doesn't want auth _even_if_ it's
3261 a snapped dir */
3262 file_layout_t **layout,
3263 bool no_lookup) // true if we cannot return a null dentry lease
3264 {
3265 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
3266 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3267
3268 if (mdr->done_locking)
3269 return mdr->in[n];
3270
3271 // traverse
3272 CF_MDS_MDRContextFactory cf(mdcache, mdr);
3273 int r = mdcache->path_traverse(mdr, cf, refpath, &mdr->dn[n], &mdr->in[n], MDS_TRAVERSE_FORWARD);
3274 if (r > 0)
3275 return NULL; // delayed
3276 if (r < 0) { // error
3277 if (r == -ENOENT && n == 0 && !mdr->dn[n].empty()) {
3278 if (!no_lookup) {
3279 mdr->tracedn = mdr->dn[n].back();
3280 }
3281 respond_to_request(mdr, r);
3282 } else if (r == -ESTALE) {
3283 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3284 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3285 mdcache->find_ino_peers(refpath.get_ino(), c);
3286 } else {
3287 dout(10) << "FAIL on error " << r << dendl;
3288 respond_to_request(mdr, r);
3289 }
3290 return 0;
3291 }
3292 CInode *ref = mdr->in[n];
3293 dout(10) << "ref is " << *ref << dendl;
3294
3295 // fw to inode auth?
3296 if (mdr->snapid != CEPH_NOSNAP && !no_want_auth)
3297 want_auth = true;
3298
3299 if (want_auth) {
3300 if (ref->is_ambiguous_auth()) {
3301 dout(10) << "waiting for single auth on " << *ref << dendl;
3302 ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
3303 return 0;
3304 }
3305 if (!ref->is_auth()) {
3306 dout(10) << "fw to auth for " << *ref << dendl;
3307 mdcache->request_forward(mdr, ref->authority().first);
3308 return 0;
3309 }
3310
3311 // auth_pin?
3312 // do NOT proceed if freezing, as cap release may defer in that case, and
3313 // we could deadlock when we try to lock @ref.
3314 // if we're already auth_pinned, continue; the release has already been processed.
3315 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3316 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3317 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3318 ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3319 /* If we have any auth pins, this will deadlock.
3320 * But the only way to get here when we already hold auth pins
3321 * is that we're on an inode with snapshots that got updated
3322 * between dispatches of this request. So we're going to drop
3323 * our locks and our auth pins and reacquire them later.
3324 *
3325 * This is safe since we're only in this function when working on
3326 * a single MDS request; otherwise we'd be in
3327 * rdlock_path_xlock_dentry.
3328 */
3329 mds->locker->drop_locks(mdr.get(), NULL);
3330 mdr->drop_local_auth_pins();
3331 if (!mdr->remote_auth_pins.empty())
3332 mds->locker->notify_freeze_waiter(ref);
3333 return 0;
3334 }
3335
3336 mdr->auth_pin(ref);
3337 }
3338
3339 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3340 lov.add_rdlock(&mdr->dn[n][i]->lock);
3341 if (layout)
3342 mds->locker->include_snap_rdlocks_wlayout(ref, lov, layout);
3343 else
3344 mds->locker->include_snap_rdlocks(ref, lov);
3345
3346 // set and pin ref
3347 mdr->pin(ref);
3348 return ref;
3349 }
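// (A minimal sketch of the caller pattern, mirroring the handlers below:
//
//    MutationImpl::LockOpVec lov;
//    CInode *in = rdlock_path_pin_ref(mdr, 0, lov, want_auth);
//    if (!in)
//      return;  // forwarded, queued for retry, or already replied to
//    ...append per-operation locks to lov...
//    if (!mds->locker->acquire_locks(mdr, lov))
//      return;  // request is re-dispatched once the locks are obtainable
// )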
3350
3351
3352 /** rdlock_path_xlock_dentry
3353 * traverse path to the directory that could/would contain dentry.
3354 * make sure i am auth for that dentry, forward as necessary.
3355 * create null dentry in place (or use existing if okexist).
3356 * get rdlocks on traversed dentries, xlock on new dentry.
3357 */
3358 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr, int n,
3359 MutationImpl::LockOpVec& lov,
3360 bool okexist, bool mustexist, bool alwaysxlock,
3361 file_layout_t **layout)
3362 {
3363 const filepath& refpath = n ? mdr->get_filepath2() : mdr->get_filepath();
3364
3365 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3366
3367 client_t client = mdr->get_client();
3368
3369 if (mdr->done_locking)
3370 return mdr->dn[n].back();
3371
3372 CDir *dir = traverse_to_auth_dir(mdr, mdr->dn[n], refpath);
3373 if (!dir) return 0;
3374
3375 CInode *diri = dir->get_inode();
3376 if (!mdr->reqid.name.is_mds()) {
3377 if (diri->is_system() && !diri->is_root()) {
3378 respond_to_request(mdr, -EROFS);
3379 return 0;
3380 }
3381 }
3382 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3383 respond_to_request(mdr, -ENOENT);
3384 return 0;
3385 }
3386
3387 // make a null dentry?
3388 std::string_view dname = refpath.last_dentry();
3389 CDentry *dn;
3390 if (mustexist) {
3391 dn = dir->lookup(dname);
3392
3393 // make sure dir is complete
3394 if (!dn && !dir->is_complete() &&
3395 (!dir->has_bloom() || dir->is_in_bloom(dname))) {
3396 dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
3397 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
3398 return 0;
3399 }
3400
3401 // readable?
3402 if (dn && !dn->lock.can_read(client) && dn->lock.get_xlock_by() != mdr) {
3403 dout(10) << "waiting on xlocked dentry " << *dn << dendl;
3404 dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
3405 return 0;
3406 }
3407
3408 // exists?
3409 if (!dn || dn->get_linkage(client, mdr)->is_null()) {
3410 dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
3411 respond_to_request(mdr, -ENOENT);
3412 return 0;
3413 }
3414 } else {
3415 dn = prepare_null_dentry(mdr, dir, dname, okexist);
3416 if (!dn)
3417 return 0;
3418 }
3419
3420 mdr->dn[n].push_back(dn);
3421 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
3422 mdr->in[n] = dnl->get_inode();
3423
3424 // -- lock --
3425 // NOTE: rename takes the same set of locks for srcdn
3426 for (int i=0; i<(int)mdr->dn[n].size(); i++)
3427 lov.add_rdlock(&mdr->dn[n][i]->lock);
3428 if (alwaysxlock || dnl->is_null())
3429 lov.add_xlock(&dn->lock); // new dn, xlock
3430 else
3431 lov.add_rdlock(&dn->lock); // existing dn, rdlock
3432 lov.add_wrlock(&dn->get_dir()->inode->filelock); // also, wrlock on dir mtime/fragstat
3433 lov.add_wrlock(&dn->get_dir()->inode->nestlock); // ...and on the nested rstat
3434 if (layout)
3435 mds->locker->include_snap_rdlocks_wlayout(dn->get_dir()->inode, lov, layout);
3436 else
3437 mds->locker->include_snap_rdlocks(dn->get_dir()->inode, lov);
3438
3439 return dn;
3440 }
3441
3442
3443
3444
3445
3446 /**
3447 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3448 *
3449 * @param diri base inode
3450 * @param fg the exact frag we want
3451 * @param mdr request
3452 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3453 */
3454 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3455 {
3456 CDir *dir = diri->get_dirfrag(fg);
3457
3458 // not open and inode not mine?
3459 if (!dir && !diri->is_auth()) {
3460 mds_rank_t inauth = diri->authority().first;
3461 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3462 mdcache->request_forward(mdr, inauth);
3463 return 0;
3464 }
3465
3466 // not open and inode frozen?
3467 if (!dir && diri->is_frozen()) {
3468 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3469 ceph_assert(diri->get_parent_dir());
3470 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3471 return 0;
3472 }
3473
3474 // invent?
3475 if (!dir)
3476 dir = diri->get_or_open_dirfrag(mdcache, fg);
3477
3478 // am i auth for the dirfrag?
3479 if (!dir->is_auth()) {
3480 mds_rank_t auth = dir->authority().first;
3481 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3482 << ", fw to mds." << auth << dendl;
3483 mdcache->request_forward(mdr, auth);
3484 return 0;
3485 }
3486
3487 return dir;
3488 }
3489
3490
3491 // ===============================================================================
3492 // STAT
3493
3494 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3495 {
3496 const MClientRequest::const_ref &req = mdr->client_request;
3497
3498 if (req->get_filepath().depth() == 0 && is_lookup) {
3499 // refpath can't be empty for lookup but it can for
3500 // getattr (we do getattr with empty refpath for mount of '/')
3501 respond_to_request(mdr, -EINVAL);
3502 return;
3503 }
3504
3505 bool want_auth = false;
3506 int mask = req->head.args.getattr.mask;
3507 if (mask & CEPH_STAT_RSTAT)
3508 want_auth = true; // rstats are only authoritative on the auth MDS
3509
3510 MutationImpl::LockOpVec lov;
3511 CInode *ref = rdlock_path_pin_ref(mdr, 0, lov, want_auth, false, NULL,
3512 !is_lookup);
3513 if (!ref) return;
3514
3515 /*
3516 * if client currently holds the EXCL cap on a field, do not rdlock
3517 * it; client's stat() will result in valid info if _either_ EXCL
3518 * cap is held or MDS rdlocks and reads the value here.
3519 *
3520 * handling this case here is easier than weakening rdlock
3521 * semantics... that would cause problems elsewhere.
3522 */
3523 client_t client = mdr->get_client();
3524 int issued = 0;
3525 Capability *cap = ref->get_client_cap(client);
3526 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3527 mdr->snapid <= cap->client_follows))
3528 issued = cap->issued();
3529
3530 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3531 lov.add_rdlock(&ref->linklock);
3532 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3533 lov.add_rdlock(&ref->authlock);
3534 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3535 lov.add_rdlock(&ref->xattrlock);
3536 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3537 // Don't wait on unstable filelock if client is allowed to read file size.
3538 // This can reduce the response time of getattr in the case that multiple
3539 // clients do stat(2) and there are writers.
3540 // The downside of this optimization is that the MDS may not issue Fs caps
3541 // along with the getattr reply, so the client may need to send more getattrs.
3542 if (mdr->is_rdlocked(&ref->filelock)) {
3543 lov.add_rdlock(&ref->filelock);
3544 } else if (ref->filelock.is_stable() ||
3545 ref->filelock.get_num_wrlocks() > 0 ||
3546 !ref->filelock.can_read(mdr->get_client())) {
3547 lov.add_rdlock(&ref->filelock);
3548 mdr->done_locking = false;
3549 }
3550 }
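// (Illustration, not from the source: each stat mask bit above is paired with
//  the lock protecting the corresponding fields -- linklock for nlink, authlock
//  for mode/uid/gid, xattrlock for xattrs, filelock for size/mtime.  When the
//  client already holds the matching EXCL cap, its own values are at least as
//  fresh as ours, so the rdlock is skipped, per the comment above.)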
3551
3552 if (!mds->locker->acquire_locks(mdr, lov))
3553 return;
3554
3555 if (!check_access(mdr, ref, MAY_READ))
3556 return;
3557
3558 utime_t now = ceph_clock_now();
3559 mdr->set_mds_stamp(now);
3560
3561 // note which caps are requested, so we return at least a snapshot
3562 // value for them. (currently this matters for xattrs and inline data)
3563 mdr->getattr_caps = mask;
3564
3565 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3566
3567 // reply
3568 dout(10) << "reply to stat on " << *req << dendl;
3569 mdr->tracei = ref;
3570 if (is_lookup)
3571 mdr->tracedn = mdr->dn[0].back();
3572 respond_to_request(mdr, 0);
3573 }
3574
3575 struct C_MDS_LookupIno2 : public ServerContext {
3576 MDRequestRef mdr;
3577 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3578 void finish(int r) override {
3579 server->_lookup_ino_2(mdr, r);
3580 }
3581 };
3582
3583 /*
3584 * filepath: ino
3585 */
3586 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3587 bool want_parent, bool want_dentry)
3588 {
3589 const MClientRequest::const_ref &req = mdr->client_request;
3590
3591 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3592 return _lookup_snap_ino(mdr);
3593
3594 inodeno_t ino = req->get_filepath().get_ino();
3595 CInode *in = mdcache->get_inode(ino);
3596 if (in && in->state_test(CInode::STATE_PURGING)) {
3597 respond_to_request(mdr, -ESTALE);
3598 return;
3599 }
3600 if (!in) {
3601 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3602 return;
3603 }
3604
3605 if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
3606 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3607 return;
3608 }
3609
3610 // check for nothing (not read or write); this still applies the
3611 // path check.
3612 if (!check_access(mdr, in, 0))
3613 return;
3614
3615 CDentry *dn = in->get_projected_parent_dn();
3616 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3617
3618 MutationImpl::LockOpVec lov;
3619 if (dn && (want_parent || want_dentry)) {
3620 mdr->pin(dn);
3621 lov.add_rdlock(&dn->lock);
3622 }
3623
3624 unsigned mask = req->head.args.lookupino.mask;
3625 if (mask) {
3626 Capability *cap = in->get_client_cap(mdr->get_client());
3627 int issued = 0;
3628 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3629 issued = cap->issued();
3630 // permission bits, ACL/security xattrs
3631 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3632 lov.add_rdlock(&in->authlock);
3633 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3634 lov.add_rdlock(&in->xattrlock);
3635
3636 mdr->getattr_caps = mask;
3637 }
3638
3639 if (!lov.empty()) {
3640 if (!mds->locker->acquire_locks(mdr, lov))
3641 return;
3642
3643 if (diri != NULL) {
3644 // need read access to directory inode
3645 if (!check_access(mdr, diri, MAY_READ))
3646 return;
3647 }
3648 }
3649
3650 if (want_parent) {
3651 if (in->is_base()) {
3652 respond_to_request(mdr, -EINVAL);
3653 return;
3654 }
3655 if (!diri || diri->is_stray()) {
3656 respond_to_request(mdr, -ESTALE);
3657 return;
3658 }
3659 dout(10) << "reply to lookup_parent " << *in << dendl;
3660 mdr->tracei = diri;
3661 respond_to_request(mdr, 0);
3662 } else {
3663 if (want_dentry) {
3664 inodeno_t dirino = req->get_filepath2().get_ino();
3665 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3666 respond_to_request(mdr, -ENOENT);
3667 return;
3668 }
3669 dout(10) << "reply to lookup_name " << *in << dendl;
3670 } else
3671 dout(10) << "reply to lookup_ino " << *in << dendl;
3672
3673 mdr->tracei = in;
3674 if (want_dentry)
3675 mdr->tracedn = dn;
3676 respond_to_request(mdr, 0);
3677 }
3678 }
3679
3680 void Server::_lookup_snap_ino(MDRequestRef& mdr)
3681 {
3682 const MClientRequest::const_ref &req = mdr->client_request;
3683
3684 vinodeno_t vino;
3685 vino.ino = req->get_filepath().get_ino();
3686 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3687 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3688 __u32 hash = req->head.args.lookupino.hash;
3689
3690 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3691
3692 CInode *in = mdcache->lookup_snap_inode(vino);
3693 if (!in) {
3694 in = mdcache->get_inode(vino.ino);
3695 if (in) {
3696 if (in->state_test(CInode::STATE_PURGING) ||
3697 !in->has_snap_data(vino.snapid)) {
3698 if (in->is_dir() || !parent_ino) {
3699 respond_to_request(mdr, -ESTALE);
3700 return;
3701 }
3702 in = NULL;
3703 }
3704 }
3705 }
3706
3707 if (in) {
3708 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
3709 mdr->snapid = vino.snapid;
3710 mdr->tracei = in;
3711 respond_to_request(mdr, 0);
3712 return;
3713 }
3714
3715 CInode *diri = NULL;
3716 if (parent_ino) {
3717 diri = mdcache->get_inode(parent_ino);
3718 if (!diri) {
3719 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
3720 return;
3721 }
3722
3723 if (!diri->is_dir()) {
3724 respond_to_request(mdr, -EINVAL);
3725 return;
3726 }
3727
3728 MutationImpl::LockOpVec lov;
3729 lov.add_rdlock(&diri->dirfragtreelock);
3730 if (!mds->locker->acquire_locks(mdr, lov))
3731 return;
3732
3733 frag_t frag = diri->dirfragtree[hash];
3734 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
3735 if (!dir)
3736 return;
3737
3738 if (!dir->is_complete()) {
3739 if (dir->is_frozen()) {
3740 mds->locker->drop_locks(mdr.get());
3741 mdr->drop_local_auth_pins();
3742 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3743 return;
3744 }
3745 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
3746 return;
3747 }
3748
3749 respond_to_request(mdr, -ESTALE);
3750 } else {
3751 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
3752 }
3753 }
3754
3755 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
3756 {
3757 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
3758 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3759
3760 // `r` is a rank if >=0, else an error code
3761 if (r >= 0) {
3762 mds_rank_t dest_rank(r);
3763 if (dest_rank == mds->get_nodeid())
3764 dispatch_client_request(mdr);
3765 else
3766 mdcache->request_forward(mdr, dest_rank);
3767 return;
3768 }
3769
3770 // give up
3771 if (r == -ENOENT || r == -ENODATA)
3772 r = -ESTALE;
3773 respond_to_request(mdr, r);
3774 }
3775
3776
3777 /* This function takes responsibility for the passed mdr*/
3778 void Server::handle_client_open(MDRequestRef& mdr)
3779 {
3780 const MClientRequest::const_ref &req = mdr->client_request;
3781 dout(7) << "open on " << req->get_filepath() << dendl;
3782
3783 int flags = req->head.args.open.flags;
3784 int cmode = ceph_flags_to_mode(flags);
3785 if (cmode < 0) {
3786 respond_to_request(mdr, -EINVAL);
3787 return;
3788 }
3789
3790 bool need_auth = !file_mode_is_readonly(cmode) ||
3791 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
3792
3793 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
3794 dout(7) << "read-only FS" << dendl;
3795 respond_to_request(mdr, -EROFS);
3796 return;
3797 }
3798
3799 MutationImpl::LockOpVec lov;
3800 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, need_auth);
3801 if (!cur)
3802 return;
3803
3804 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
3805 ceph_assert(!need_auth);
3806 mdr->done_locking = false;
3807 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true); // shadows outer cur; NULL here means forwarded or queued
3808 if (!cur)
3809 return;
3810 }
3811
3812 if (!cur->inode.is_file()) {
3813 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
3814 cmode = CEPH_FILE_MODE_PIN;
3815 // if the inode is a symlink and the client wants to follow it, ignore O_TRUNC
3816 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
3817 flags &= ~CEPH_O_TRUNC;
3818 }
3819
3820 dout(10) << "open flags = " << flags
3821 << ", filemode = " << cmode
3822 << ", need_auth = " << need_auth
3823 << dendl;
3824
3825 // regular file?
3826 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
3827 dout(7) << "not a file or dir " << *cur << dendl;
3828 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
3829 return;
3830 }*/
3831 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
3832 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
3833 respond_to_request(mdr, -EINVAL);
3834 return;
3835 }
3836
3837 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
3838 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
3839 // we should return -EISDIR for directory, return -EINVAL for other non-regular
3840 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
3841 return;
3842 }
3843
3844 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
3845 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3846 dout(7) << "old client cannot open inline data file " << *cur << dendl;
3847 respond_to_request(mdr, -EPERM);
3848 return;
3849 }
3850
3851 // snapped data is read only
3852 if (mdr->snapid != CEPH_NOSNAP &&
3853 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
3854 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
3855 respond_to_request(mdr, -EROFS);
3856 return;
3857 }
3858
3859 unsigned mask = req->head.args.open.mask;
3860 if (mask) {
3861 Capability *cap = cur->get_client_cap(mdr->get_client());
3862 int issued = 0;
3863 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3864 issued = cap->issued();
3865 // permission bits, ACL/security xattrs
3866 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3867 lov.add_rdlock(&cur->authlock);
3868 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3869 lov.add_rdlock(&cur->xattrlock);
3870
3871 mdr->getattr_caps = mask;
3872 }
3873
3874 // O_TRUNC
3875 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
3876 ceph_assert(cur->is_auth());
3877
3878 lov.add_xlock(&cur->filelock);
3879 if (!mds->locker->acquire_locks(mdr, lov))
3880 return;
3881
3882 if (!check_access(mdr, cur, MAY_WRITE))
3883 return;
3884
3885 // wait for pending truncate?
3886 const auto pi = cur->get_projected_inode();
3887 if (pi->is_truncating()) {
3888 dout(10) << " waiting for pending truncate from " << pi->truncate_from
3889 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
3890 mds->locker->drop_locks(mdr.get());
3891 mdr->drop_local_auth_pins();
3892 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
3893 return;
3894 }
3895
3896 do_open_truncate(mdr, cmode);
3897 return;
3898 }
3899
3900 // sync filelock if snapped.
3901 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
3902 // and that data itself is flushed so that we can read the snapped data off disk.
3903 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
3904 lov.add_rdlock(&cur->filelock);
3905 }
3906
3907 if (!mds->locker->acquire_locks(mdr, lov))
3908 return;
3909
3910 mask = MAY_READ;
3911 if (cmode & CEPH_FILE_MODE_WR)
3912 mask |= MAY_WRITE;
3913 if (!check_access(mdr, cur, mask))
3914 return;
3915
3916 utime_t now = ceph_clock_now();
3917 mdr->set_mds_stamp(now);
3918
3919 if (cur->is_file() || cur->is_dir()) {
3920 if (mdr->snapid == CEPH_NOSNAP) {
3921 // register new cap
3922 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr->session, 0, req->is_replay());
3923 if (cap)
3924 dout(12) << "open issued caps " << ccap_string(cap->pending())
3925 << " for " << req->get_source()
3926 << " on " << *cur << dendl;
3927 } else {
3928 int caps = ceph_caps_for_mode(cmode);
3929 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
3930 << " for " << req->get_source()
3931 << " snapid " << mdr->snapid
3932 << " on " << *cur << dendl;
3933 mdr->snap_caps = caps;
3934 }
3935 }
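// (Background, stated informally: ceph_caps_for_mode() maps an open mode to the
//  cap bits such a client wants, e.g. FILE_MODE_RD roughly to Fr/Fc plus the
//  shared/pin bits.  For a snapshot open those caps are handed out directly as
//  snap_caps: snapped metadata is immutable, so no revocation protocol is
//  needed.)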
3936
3937 // increase max_size?
3938 if (cmode & CEPH_FILE_MODE_WR)
3939 mds->locker->check_inode_max_size(cur);
3940
3941 // make sure this inode gets into the journal
3942 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
3943 mdcache->open_file_table.should_log_open(cur)) {
3944 EOpen *le = new EOpen(mds->mdlog);
3945 mdlog->start_entry(le);
3946 le->add_clean_inode(cur);
3947 mdlog->submit_entry(le);
3948 }
3949
3950 // hit pop
3951 if (cmode & CEPH_FILE_MODE_WR)
3952 mds->balancer->hit_inode(cur, META_POP_IWR);
3953 else
3954 mds->balancer->hit_inode(cur, META_POP_IRD,
3955 mdr->client_request->get_source().num());
3956
3957 CDentry *dn = 0;
3958 if (req->get_dentry_wanted()) {
3959 ceph_assert(mdr->dn[0].size());
3960 dn = mdr->dn[0].back();
3961 }
3962
3963 mdr->tracei = cur;
3964 mdr->tracedn = dn;
3965 respond_to_request(mdr, 0);
3966 }
3967
3968 class C_MDS_openc_finish : public ServerLogContext {
3969 CDentry *dn;
3970 CInode *newi;
3971 public:
3972 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
3973 ServerLogContext(s, r), dn(d), newi(ni) {}
3974 void finish(int r) override {
3975 ceph_assert(r == 0);
3976
3977 dn->pop_projected_linkage();
3978
3979 // dirty inode, dn, dir
3980 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
3981 newi->mark_dirty(newi->inode.version+1, mdr->ls);
3982 newi->mark_dirty_parent(mdr->ls, true);
3983
3984 mdr->apply();
3985
3986 get_mds()->locker->share_inode_max_size(newi);
3987
3988 MDRequestRef null_ref;
3989 get_mds()->mdcache->send_dentry_link(dn, null_ref);
3990
3991 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
3992
3993 server->respond_to_request(mdr, 0);
3994
3995 ceph_assert(g_conf()->mds_kill_openc_at != 1);
3996 }
3997 };
3998
3999 /* This function takes responsibility for the passed mdr*/
4000 void Server::handle_client_openc(MDRequestRef& mdr)
4001 {
4002 const MClientRequest::const_ref &req = mdr->client_request;
4003 client_t client = mdr->get_client();
4004
4005 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4006
4007 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4008 if (cmode < 0) {
4009 respond_to_request(mdr, -EINVAL);
4010 return;
4011 }
4012
4013 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4014
4015 if (!excl) {
4016 CF_MDS_MDRContextFactory cf(mdcache, mdr);
4017 int r = mdcache->path_traverse(mdr, cf, req->get_filepath(),
4018 &mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
4019 if (r > 0) return;
4020 if (r == 0) {
4021 // it existed.
4022 handle_client_open(mdr);
4023 return;
4024 }
4025 if (r < 0 && r != -ENOENT) {
4026 if (r == -ESTALE) {
4027 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
4028 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
4029 mdcache->find_ino_peers(req->get_filepath().get_ino(), c);
4030 } else {
4031 dout(10) << "FAIL on error " << r << dendl;
4032 respond_to_request(mdr, r);
4033 }
4034 return;
4035 }
4036 }
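// (Why the pre-traverse above: without O_EXCL, an existing target simply turns
//  the create into a plain open, so we probe the path first and fall through to
//  the create path only on -ENOENT.)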
4037
4038 MutationImpl::LockOpVec lov;
4039 file_layout_t *dir_layout = nullptr;
4040 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov,
4041 !excl, false, false, &dir_layout);
4042 if (!dn) return;
4043 if (mdr->snapid != CEPH_NOSNAP) {
4044 respond_to_request(mdr, -EROFS);
4045 return;
4046 }
4047 // set layout
4048 file_layout_t layout;
4049 if (dir_layout)
4050 layout = *dir_layout;
4051 else
4052 layout = mdcache->default_file_layout;
4053
4054 // What kind of client caps are required to complete this operation
4055 uint64_t access = MAY_WRITE;
4056
4057 const auto default_layout = layout;
4058
4059 // fill in any special params from client
4060 if (req->head.args.open.stripe_unit)
4061 layout.stripe_unit = req->head.args.open.stripe_unit;
4062 if (req->head.args.open.stripe_count)
4063 layout.stripe_count = req->head.args.open.stripe_count;
4064 if (req->head.args.open.object_size)
4065 layout.object_size = req->head.args.open.object_size;
4066 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4067 (__s32)req->head.args.open.pool >= 0) {
4068 layout.pool_id = req->head.args.open.pool;
4069
4070 // make sure we have as new a map as the client
4071 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4072 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4073 return;
4074 }
4075 }
4076
4077 // If client doesn't have capability to modify layout pools, then
4078 // only permit this request if the requested pool matches what the
4079 // file would have inherited anyway from its parent.
4080 if (default_layout != layout) {
4081 access |= MAY_SET_VXATTR;
4082 }
4083
4084 if (!layout.is_valid()) {
4085 dout(10) << " invalid initial file layout" << dendl;
4086 respond_to_request(mdr, -EINVAL);
4087 return;
4088 }
4089 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4090 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4091 respond_to_request(mdr, -EINVAL);
4092 return;
4093 }
4094
4095 // created null dn.
4096 CDir *dir = dn->get_dir();
4097 CInode *diri = dir->get_inode();
4098 lov.add_rdlock(&diri->authlock);
4099 if (!mds->locker->acquire_locks(mdr, lov))
4100 return;
4101
4102 if (!check_access(mdr, diri, access))
4103 return;
4104
4105 if (!check_fragment_space(mdr, dir))
4106 return;
4107
4108 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4109
4110 if (!dnl->is_null()) {
4111 // it existed.
4112 ceph_assert(req->head.args.open.flags & CEPH_O_EXCL);
4113 dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
4114 mdr->tracei = dnl->get_inode();
4115 mdr->tracedn = dn;
4116 respond_to_request(mdr, -EEXIST);
4117 return;
4118 }
4119
4120 // create inode.
4121 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4122 req->head.args.open.mode | S_IFREG, &layout);
4123 ceph_assert(in);
4124
4125 // it's a file.
4126 dn->push_projected_linkage(in);
4127
4128 in->inode.version = dn->pre_dirty();
4129 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4130 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
4131 in->inode.update_backtrace();
4132 in->inode.rstat.rfiles = 1;
4133
4134 SnapRealm *realm = diri->find_snaprealm();
4135 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4136 ceph_assert(follows >= realm->get_newest_seq());
4137
4138 ceph_assert(dn->first == follows+1);
4139 in->first = dn->first;
4140
4141 // do the open
4142 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, req->is_replay());
4143 in->authlock.set_state(LOCK_EXCL);
4144 in->xattrlock.set_state(LOCK_EXCL);
4145
4146 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4147 in->inode.client_ranges[client].range.first = 0;
4148 in->inode.client_ranges[client].range.last = in->inode.get_layout_size_increment();
4149 in->inode.client_ranges[client].follows = follows;
4150 cap->mark_clientwriteable();
4151 }
4152
4153 // prepare finisher
4154 mdr->ls = mdlog->get_current_segment();
4155 EUpdate *le = new EUpdate(mdlog, "openc");
4156 mdlog->start_entry(le);
4157 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4158 journal_allocated_inos(mdr, &le->metablob);
4159 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4160 le->metablob.add_primary_dentry(dn, in, true, true, true);
4161
4162 // make sure this inode gets into the journal
4163 le->metablob.add_opened_ino(in->ino());
4164
4165 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
4166
4167 if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4168 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4169 // add the file created flag onto the reply if create_flags features is supported
4170 encode(in->inode.ino, mdr->reply_extra_bl);
4171 }
4172
4173 journal_and_reply(mdr, in, dn, le, fin);
4174
4175 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4176 // have overshot the split size (multiple opencs in flight), so here is
4177 // an early chance to split the dir if this openc makes it oversized.
4178 mds->balancer->maybe_fragment(dir, false);
4179 }
4180
4181
4182
4183 void Server::handle_client_readdir(MDRequestRef& mdr)
4184 {
4185 const MClientRequest::const_ref &req = mdr->client_request;
4186 client_t client = req->get_source().num();
4187 MutationImpl::LockOpVec lov;
4188 CInode *diri = rdlock_path_pin_ref(mdr, 0, lov, false, true);
4189 if (!diri) return;
4190
4191 // it's a directory, right?
4192 if (!diri->is_dir()) {
4193 // not a dir
4194 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
4195 respond_to_request(mdr, -ENOTDIR);
4196 return;
4197 }
4198
4199 lov.add_rdlock(&diri->filelock);
4200 lov.add_rdlock(&diri->dirfragtreelock);
4201
4202 if (!mds->locker->acquire_locks(mdr, lov))
4203 return;
4204
4205 if (!check_access(mdr, diri, MAY_READ))
4206 return;
4207
4208 // which frag?
4209 frag_t fg = (__u32)req->head.args.readdir.frag;
4210 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4211 string offset_str = req->get_path2();
4212
4213 __u32 offset_hash = 0;
4214 if (!offset_str.empty())
4215 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4216 else
4217 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4218
4219 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4220 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4221
4222 // does the frag exist?
4223 if (diri->dirfragtree[fg.value()] != fg) {
4224 frag_t newfg;
4225 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4226 if (fg.contains((unsigned)offset_hash)) {
4227 newfg = diri->dirfragtree[offset_hash];
4228 } else {
4229 // client actually wants next frag
4230 newfg = diri->dirfragtree[fg.value()];
4231 }
4232 } else {
4233 offset_str.clear();
4234 newfg = diri->dirfragtree[fg.value()];
4235 }
4236 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4237 fg = newfg;
4238 }
4239
4240 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4241 if (!dir) return;
4242
4243 // ok!
4244 dout(10) << "handle_client_readdir on " << *dir << dendl;
4245 ceph_assert(dir->is_auth());
4246
4247 if (!dir->is_complete()) {
4248 if (dir->is_frozen()) {
4249 dout(7) << "dir is frozen " << *dir << dendl;
4250 mds->locker->drop_locks(mdr.get());
4251 mdr->drop_local_auth_pins();
4252 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4253 return;
4254 }
4255 // fetch
4256 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4257 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4258 return;
4259 }
4260
4261 #ifdef MDS_VERIFY_FRAGSTAT
4262 dir->verify_fragstat();
4263 #endif
4264
4265 utime_t now = ceph_clock_now();
4266 mdr->set_mds_stamp(now);
4267
4268 snapid_t snapid = mdr->snapid;
4269 dout(10) << "snapid " << snapid << dendl;
4270
4271 SnapRealm *realm = diri->find_snaprealm();
4272
4273 unsigned max = req->head.args.readdir.max_entries;
4274 if (!max)
4275 max = dir->get_num_any(); // whatever, something big.
4276 unsigned max_bytes = req->head.args.readdir.max_bytes;
4277 if (!max_bytes)
4278 // make sure at least one item can be encoded
4279 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
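// (The 512 KiB figure is a budget for the encoded reply.  Xattrs are the one
//  per-entry item without a small fixed bound, so the extra
//  mds_max_xattr_pairs_size headroom ensures even a maximally-sized single
//  entry can be returned.)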
4280
4281 // start final blob
4282 bufferlist dirbl;
4283 DirStat ds;
4284 ds.frag = dir->get_frag();
4285 ds.auth = dir->get_dir_auth().first;
4286 if (dir->is_auth())
4287 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4288
4289 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4290
4291 // count bytes available.
4292 // this isn't perfect, but we should capture the main variable/unbounded size items!
4293 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4294 int bytes_left = max_bytes - front_bytes;
4295 bytes_left -= realm->get_snap_trace().length();
4296
4297 // build dir contents
4298 bufferlist dnbl;
4299 __u32 numfiles = 0;
4300 bool start = !offset_hash && offset_str.empty();
4301 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4302 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4303 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4304 bool end = (it == dir->end());
4305 for (; !end && numfiles < max; end = (it == dir->end())) {
4306 CDentry *dn = it->second;
4307 ++it;
4308
4309 if (dn->state_test(CDentry::STATE_PURGING))
4310 continue;
4311
4312 bool dnp = dn->use_projected(client, mdr);
4313 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4314
4315 if (dnl->is_null())
4316 continue;
4317
4318 if (dn->last < snapid || dn->first > snapid) {
4319 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4320 continue;
4321 }
4322
4323 if (!start) {
4324 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4325 if (!(offset_key < dn->key()))
4326 continue;
4327 }
4328
4329 CInode *in = dnl->get_inode();
4330
4331 if (in && in->ino() == CEPH_INO_CEPH)
4332 continue;
4333
4334 // remote link?
4335 // better for the MDS to do the work, if we think the client will stat any of these files.
4336 if (dnl->is_remote() && !in) {
4337 in = mdcache->get_inode(dnl->get_remote_ino());
4338 if (in) {
4339 dn->link_remote(dnl, in);
4340 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4341 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4342 continue;
4343 } else {
4344 // touch everything i _do_ have
4345 for (auto &p : *dir) {
4346 if (!p.second->get_linkage()->is_null())
4347 mdcache->lru.lru_touch(p.second);
4348 }
4349
4350 // already issued caps and leases, reply immediately.
4351 if (dnbl.length() > 0) {
4352 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4353 dout(10) << " open remote dentry after caps were issued, stopping at "
4354 << dnbl.length() << " < " << bytes_left << dendl;
4355 break;
4356 }
4357
4358 mds->locker->drop_locks(mdr.get());
4359 mdr->drop_local_auth_pins();
4360 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4361 return;
4362 }
4363 }
4364 ceph_assert(in);
4365
4366 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4367 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4368 break;
4369 }
4370
4371 unsigned start_len = dnbl.length();
4372
4373 // dentry
4374 dout(12) << "including dn " << *dn << dendl;
4375 encode(dn->get_name(), dnbl);
4376 mds->locker->issue_client_lease(dn, client, dnbl, now, mdr->session);
4377
4378 // inode
4379 dout(12) << "including inode " << *in << dendl;
4380 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4381 if (r < 0) {
4382 // chop off dn->name, lease
4383 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4384 bufferlist keep;
4385 keep.substr_of(dnbl, 0, start_len);
4386 dnbl.swap(keep);
4387 break;
4388 }
4389 ceph_assert(r >= 0);
4390 numfiles++;
4391
4392 // touch dn
4393 mdcache->lru.lru_touch(dn);
4394 }
4395
4396 __u16 flags = 0;
4397 if (end) {
4398 flags = CEPH_READDIR_FRAG_END;
4399 if (start)
4400 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4401 }
4402 // older clients may only understand the END and COMPLETE flags
4403 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4404 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4405 }
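// (Reply-flag semantics, informally: FRAG_END says this fragment is exhausted;
//  FRAG_COMPLETE additionally says the listing started at offset 0, so the
//  client has seen the whole fragment and may cache it as complete.
//  HASH_ORDER/OFFSET_HASH tell newer clients that entries are ordered by, and
//  resumable from, the dentry hash.)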
4406
4407 // finish final blob
4408 encode(numfiles, dirbl);
4409 encode(flags, dirbl);
4410 dirbl.claim_append(dnbl);
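// (Resulting reply layout, informally:
//    [DirStat][numfiles: u32][flags: u16][(name, lease, InodeStat) * numfiles]
//  dnbl was built entry-by-entry above and is spliced in whole here; this is
//  also why front_bytes earlier reserved a u32 plus two u8s.)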
4411
4412 // yay, reply
4413 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4414 << " bytes=" << dirbl.length()
4415 << " start=" << (int)start
4416 << " end=" << (int)end
4417 << dendl;
4418 mdr->reply_extra_bl = dirbl;
4419
4420 // bump popularity. NOTE: this doesn't quite capture it.
4421 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
4422
4423 // reply
4424 mdr->tracei = diri;
4425 respond_to_request(mdr, 0);
4426 }
4427
4428
4429
4430 // ===============================================================================
4431 // INODE UPDATES
4432
4433
4434 /*
4435 * finisher for basic inode updates
4436 */
4437 class C_MDS_inode_update_finish : public ServerLogContext {
4438 CInode *in;
4439 bool truncating_smaller, changed_ranges, new_realm;
4440 public:
4441 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4442 bool sm=false, bool cr=false, bool nr=false) :
4443 ServerLogContext(s, r), in(i),
4444 truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
4445 void finish(int r) override {
4446 ceph_assert(r == 0);
4447
4448 // apply
4449 in->pop_and_dirty_projected_inode(mdr->ls);
4450 mdr->apply();
4451
4452 MDSRank *mds = get_mds();
4453
4454 // notify any clients
4455 if (truncating_smaller && in->inode.is_truncating()) {
4456 mds->locker->issue_truncate(in);
4457 mds->mdcache->truncate_inode(in, mdr->ls);
4458 }
4459
4460 if (new_realm) {
4461 int op = CEPH_SNAP_OP_SPLIT;
4462 mds->mdcache->send_snap_update(in, 0, op);
4463 mds->mdcache->do_realm_invalidate_and_update_notify(in, op);
4464 }
4465
4466 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4467
4468 server->respond_to_request(mdr, 0);
4469
4470 if (changed_ranges)
4471 get_mds()->locker->share_inode_max_size(in);
4472 }
4473 };
4474
4475 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4476 {
4477 const MClientRequest::const_ref &req = mdr->client_request;
4478 MutationImpl::LockOpVec lov;
4479
4480 // get the inode to operate on, and set up any locks needed for that
4481 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4482 if (!cur)
4483 return;
4484
4485 lov.add_xlock(&cur->flocklock);
4486 /* acquire_locks will return true if it gets the locks. If it fails,
4487 it will redeliver this request at a later date, so drop the request.
4488 */
4489 if (!mds->locker->acquire_locks(mdr, lov)) {
4490 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4491 return;
4492 }
4493
4494 // copy the lock change into a ceph_filelock so we can store/apply it
4495 ceph_filelock set_lock;
4496 set_lock.start = req->head.args.filelock_change.start;
4497 set_lock.length = req->head.args.filelock_change.length;
4498 set_lock.client = req->get_orig_source().num();
4499 set_lock.owner = req->head.args.filelock_change.owner;
4500 set_lock.pid = req->head.args.filelock_change.pid;
4501 set_lock.type = req->head.args.filelock_change.type;
4502 bool will_wait = req->head.args.filelock_change.wait;
4503
4504 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4505
4506 ceph_lock_state_t *lock_state = NULL;
4507 bool interrupt = false;
4508
4509 // get the appropriate lock state
4510 switch (req->head.args.filelock_change.rule) {
4511 case CEPH_LOCK_FLOCK_INTR:
4512 interrupt = true;
4513 // fall-thru
4514 case CEPH_LOCK_FLOCK:
4515 lock_state = cur->get_flock_lock_state();
4516 break;
4517
4518 case CEPH_LOCK_FCNTL_INTR:
4519 interrupt = true;
4520 // fall-thru
4521 case CEPH_LOCK_FCNTL:
4522 lock_state = cur->get_fcntl_lock_state();
4523 break;
4524
4525 default:
4526 dout(10) << "got unknown lock rule (lock type " << set_lock.type
4527 << "), dropping request!" << dendl;
4528 respond_to_request(mdr, -EOPNOTSUPP);
4529 return;
4530 }
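// (Two independent lock tables exist per inode: flock(2)-style whole-file locks
//  and POSIX fcntl(2) byte-range locks.  The *_INTR rules are the client
//  interrupting an earlier blocked attempt on the same table, which is why they
//  share the table selection above but also set `interrupt`.)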
4531
4532 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4533 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4534 list<ceph_filelock> activated_locks;
4535 MDSContext::vec waiters;
4536 if (lock_state->is_waiting(set_lock)) {
4537 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4538 lock_state->remove_waiting(set_lock);
4539 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4540 } else if (!interrupt) {
4541 dout(10) << " unlock attempt on " << set_lock << dendl;
4542 lock_state->remove_lock(set_lock, activated_locks);
4543 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4544 }
4545 mds->queue_waiters(waiters);
4546
4547 respond_to_request(mdr, 0);
4548 } else {
4549 dout(10) << " lock attempt on " << set_lock << dendl;
4550 bool deadlock = false;
4551 if (mdr->more()->flock_was_waiting &&
4552 !lock_state->is_waiting(set_lock)) {
4553 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4554 respond_to_request(mdr, -EINTR);
4555 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4556 dout(10) << " it failed on this attempt" << dendl;
4557 // couldn't set lock right now
4558 if (deadlock) {
4559 respond_to_request(mdr, -EDEADLK);
4560 } else if (!will_wait) {
4561 respond_to_request(mdr, -EWOULDBLOCK);
4562 } else {
4563 dout(10) << " added to waiting list" << dendl;
4564 ceph_assert(lock_state->is_waiting(set_lock));
4565 mdr->more()->flock_was_waiting = true;
4566 mds->locker->drop_locks(mdr.get());
4567 mdr->drop_local_auth_pins();
4568 mdr->mark_event("failed to add lock, waiting");
4569 mdr->mark_nowarn();
4570 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4571 }
4572 } else
4573 respond_to_request(mdr, 0);
4574 }
4575 dout(10) << " state after lock change: " << *lock_state << dendl;
4576 }
4577
4578 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4579 {
4580 const MClientRequest::const_ref &req = mdr->client_request;
4581 MutationImpl::LockOpVec lov;
4582
4583 // get the inode to operate on, and set up any locks needed for that
4584 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4585 if (!cur)
4586 return;
4587
4588 /* acquire_locks will return true if it gets the locks. If it fails,
4589 it will redeliver this request at a later date, so drop the request.
4590 */
4591 lov.add_rdlock(&cur->flocklock);
4592 if (!mds->locker->acquire_locks(mdr, lov)) {
4593 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4594 return;
4595 }
4596
4597 // copy the lock change into a ceph_filelock so we can store/apply it
4598 ceph_filelock checking_lock;
4599 checking_lock.start = req->head.args.filelock_change.start;
4600 checking_lock.length = req->head.args.filelock_change.length;
4601 checking_lock.client = req->get_orig_source().num();
4602 checking_lock.owner = req->head.args.filelock_change.owner;
4603 checking_lock.pid = req->head.args.filelock_change.pid;
4604 checking_lock.type = req->head.args.filelock_change.type;
4605
4606 // get the appropriate lock state
4607 ceph_lock_state_t *lock_state = NULL;
4608 switch (req->head.args.filelock_change.rule) {
4609 case CEPH_LOCK_FLOCK:
4610 lock_state = cur->get_flock_lock_state();
4611 break;
4612
4613 case CEPH_LOCK_FCNTL:
4614 lock_state = cur->get_fcntl_lock_state();
4615 break;
4616
4617 default:
4618 dout(10) << "got unknown lock rule (lock type " << checking_lock.type << ")" << dendl;
4619 respond_to_request(mdr, -EINVAL);
4620 return;
4621 }
4622 lock_state->look_for_lock(checking_lock);
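// (F_GETLK-style semantics, as this reads: if a conflicting lock exists,
//  checking_lock is overwritten with that lock's parameters; otherwise its type
//  becomes CEPH_LOCK_UNLOCK.  Either way the struct is shipped back verbatim
//  below.)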
4623
4624 bufferlist lock_bl;
4625 encode(checking_lock, lock_bl);
4626
4627 mdr->reply_extra_bl = lock_bl;
4628 respond_to_request(mdr, 0);
4629 }
4630
4631 void Server::handle_client_setattr(MDRequestRef& mdr)
4632 {
4633 const MClientRequest::const_ref &req = mdr->client_request;
4634 MutationImpl::LockOpVec lov;
4635 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4636 if (!cur) return;
4637
4638 if (mdr->snapid != CEPH_NOSNAP) {
4639 respond_to_request(mdr, -EROFS);
4640 return;
4641 }
4642 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4643 respond_to_request(mdr, -EPERM);
4644 return;
4645 }
4646
4647 __u32 mask = req->head.args.setattr.mask;
4648 __u32 access_mask = MAY_WRITE;
4649
4650 // xlock inode
4651 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4652 lov.add_xlock(&cur->authlock);
4653 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4654 lov.add_xlock(&cur->filelock);
4655 if (mask & CEPH_SETATTR_CTIME)
4656 lov.add_wrlock(&cur->versionlock);
4657
4658 if (!mds->locker->acquire_locks(mdr, lov))
4659 return;
4660
4661 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4662 access_mask |= MAY_CHOWN;
4663
4664 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4665 access_mask |= MAY_CHGRP;
4666
4667 if (!check_access(mdr, cur, access_mask))
4668 return;
4669
4670 // trunc from bigger -> smaller?
4671 auto pip = cur->get_projected_inode();
4672
4673 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
4674
4675 // ENOSPC on growing file while full, but allow shrinks
4676 if (is_full && req->head.args.setattr.size > old_size) {
4677 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4678 respond_to_request(mdr, -ENOSPC);
4679 return;
4680 }
4681
4682 bool truncating_smaller = false;
4683 if (mask & CEPH_SETATTR_SIZE) {
4684 truncating_smaller = req->head.args.setattr.size < old_size;
4685 if (truncating_smaller && pip->is_truncating()) {
4686 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4687 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4688 mds->locker->drop_locks(mdr.get());
4689 mdr->drop_local_auth_pins();
4690 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4691 return;
4692 }
4693 }
4694
4695 bool changed_ranges = false;
4696
4697 // project update
4698 mdr->ls = mdlog->get_current_segment();
4699 EUpdate *le = new EUpdate(mdlog, "setattr");
4700 mdlog->start_entry(le);
4701
4702 auto &pi = cur->project_inode();
4703
4704 if (mask & CEPH_SETATTR_UID)
4705 pi.inode.uid = req->head.args.setattr.uid;
4706 if (mask & CEPH_SETATTR_GID)
4707 pi.inode.gid = req->head.args.setattr.gid;
4708
4709 if (mask & CEPH_SETATTR_MODE)
4710 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4711 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4712 S_ISREG(pi.inode.mode) &&
4713 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4714 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4715 }
4716
4717 if (mask & CEPH_SETATTR_MTIME)
4718 pi.inode.mtime = req->head.args.setattr.mtime;
4719 if (mask & CEPH_SETATTR_ATIME)
4720 pi.inode.atime = req->head.args.setattr.atime;
4721 if (mask & CEPH_SETATTR_BTIME)
4722 pi.inode.btime = req->head.args.setattr.btime;
4723 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4724 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4725 if (mask & CEPH_SETATTR_SIZE) {
4726 if (truncating_smaller) {
4727 pi.inode.truncate(old_size, req->head.args.setattr.size);
4728 le->metablob.add_truncate_start(cur->ino());
4729 } else {
4730 pi.inode.size = req->head.args.setattr.size;
4731 pi.inode.rstat.rbytes = pi.inode.size;
4732 }
4733 pi.inode.mtime = mdr->get_op_stamp();
4734
4735 // adjust client's max_size?
4736 CInode::mempool_inode::client_range_map new_ranges;
4737 bool max_increased = false;
4738 mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
4739 if (pi.inode.client_ranges != new_ranges) {
4740 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
4741 pi.inode.client_ranges = new_ranges;
4742 changed_ranges = true;
4743 }
4744 }
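// (Truncate bookkeeping, roughly: pi.inode.truncate() records
//  truncate_from/truncate_size and bumps the truncate seq, and
//  add_truncate_start() journals the intent.  The actual object truncation is
//  deferred until the event commits -- see C_MDS_inode_update_finish above,
//  which calls issue_truncate()/truncate_inode() once the journal entry is
//  safe.)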
4745
4746 pi.inode.version = cur->pre_dirty();
4747 pi.inode.ctime = mdr->get_op_stamp();
4748 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4749 pi.inode.rstat.rctime = mdr->get_op_stamp();
4750 pi.inode.change_attr++;
4751
4752 // log + wait
4753 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4754 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4755 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4756
4757 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
4758 truncating_smaller, changed_ranges));
4759
4760 // flush immediately if there are readers/writers waiting
4761 if (mdr->is_xlocked(&cur->filelock) &&
4762 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
4763 mds->mdlog->flush();
4764 }
4765
4766 /* Takes responsibility for mdr */
4767 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
4768 {
4769 CInode *in = mdr->in[0];
4770 client_t client = mdr->get_client();
4771 ceph_assert(in);
4772
4773 dout(10) << "do_open_truncate " << *in << dendl;
4774
4775 SnapRealm *realm = in->find_snaprealm();
4776 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr->session, realm, mdr->client_request->is_replay());
4777
4778 mdr->ls = mdlog->get_current_segment();
4779 EUpdate *le = new EUpdate(mdlog, "open_truncate");
4780 mdlog->start_entry(le);
4781
4782 // prepare
4783 auto &pi = in->project_inode();
4784 pi.inode.version = in->pre_dirty();
4785 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
4786 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4787 pi.inode.rstat.rctime = mdr->get_op_stamp();
4788 pi.inode.change_attr++;
4789
4790 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
4791 if (old_size > 0) {
4792 pi.inode.truncate(old_size, 0);
4793 le->metablob.add_truncate_start(in->ino());
4794 }
4795
4796 bool changed_ranges = false;
4797 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4798 pi.inode.client_ranges[client].range.first = 0;
4799 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
4800 pi.inode.client_ranges[client].follows = realm->get_newest_seq();
4801 changed_ranges = true;
4802 cap->mark_clientwriteable();
4803 }
4804
4805 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
4806
4807 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
4808 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
4809
4810 // make sure ino gets into the journal
4811 le->metablob.add_opened_ino(in->ino());
4812
4813 mdr->o_trunc = true;
4814
4815 CDentry *dn = 0;
4816 if (mdr->client_request->get_dentry_wanted()) {
4817 ceph_assert(mdr->dn[0].size());
4818 dn = mdr->dn[0].back();
4819 }
4820
4821 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
4822 changed_ranges));
4823 // Although the `open` part can give an early reply, the truncation won't
4824 // happen until our EUpdate is persistent; to give the client a prompt
4825 // response we must also flush that event.
4826 mdlog->flush();
4827 }
4828
4829
4830 /* This function cleans up the passed mdr */
4831 void Server::handle_client_setlayout(MDRequestRef& mdr)
4832 {
4833 const MClientRequest::const_ref &req = mdr->client_request;
4834 MutationImpl::LockOpVec lov;
4835 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true);
4836 if (!cur) return;
4837
4838 if (mdr->snapid != CEPH_NOSNAP) {
4839 respond_to_request(mdr, -EROFS);
4840 return;
4841 }
4842 if (!cur->is_file()) {
4843 respond_to_request(mdr, -EINVAL);
4844 return;
4845 }
4846 if (cur->get_projected_inode()->size ||
4847 cur->get_projected_inode()->truncate_seq > 1) {
4848 respond_to_request(mdr, -ENOTEMPTY);
4849 return;
4850 }
4851
4852 // validate layout
4853 file_layout_t layout = cur->get_projected_inode()->layout;
4854 // save existing layout for later
4855 const auto old_layout = layout;
4856
4857 int access = MAY_WRITE;
4858
4859 if (req->head.args.setlayout.layout.fl_object_size > 0)
4860 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4861 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4862 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4863 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4864 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4865 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4866 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4867
4868 // make sure we have as new a map as the client
4869 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4870 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4871 return;
4872 }
4873 }
4874
4875 // Don't permit layout modifications without 'p' caps
4876 if (layout != old_layout) {
4877 access |= MAY_SET_VXATTR;
4878 }
4879
4880 if (!layout.is_valid()) {
4881 dout(10) << "bad layout" << dendl;
4882 respond_to_request(mdr, -EINVAL);
4883 return;
4884 }
4885 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4886 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4887 respond_to_request(mdr, -EINVAL);
4888 return;
4889 }
4890
4891 lov.add_xlock(&cur->filelock);
4892 if (!mds->locker->acquire_locks(mdr, lov))
4893 return;
4894
4895 if (!check_access(mdr, cur, access))
4896 return;
4897
4898 // project update
4899 auto &pi = cur->project_inode();
4900 pi.inode.layout = layout;
4901 // add the old pool to the inode
4902 pi.inode.add_old_pool(old_layout.pool_id);
4903 pi.inode.version = cur->pre_dirty();
4904 pi.inode.ctime = mdr->get_op_stamp();
4905 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
4906 pi.inode.rstat.rctime = mdr->get_op_stamp();
4907 pi.inode.change_attr++;
4908
4909 // log + wait
4910 mdr->ls = mdlog->get_current_segment();
4911 EUpdate *le = new EUpdate(mdlog, "setlayout");
4912 mdlog->start_entry(le);
4913 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4914 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
4915 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
4916
4917 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
4918 }
4919
4920 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
4921 {
4922 const MClientRequest::const_ref &req = mdr->client_request;
4923 MutationImpl::LockOpVec lov;
4924 file_layout_t *dir_layout = nullptr;
4925 CInode *cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
4926 if (!cur) return;
4927
4928 if (mdr->snapid != CEPH_NOSNAP) {
4929 respond_to_request(mdr, -EROFS);
4930 return;
4931 }
4932
4933 if (!cur->is_dir()) {
4934 respond_to_request(mdr, -ENOTDIR);
4935 return;
4936 }
4937
4938 lov.add_xlock(&cur->policylock);
4939 if (!mds->locker->acquire_locks(mdr, lov))
4940 return;
4941
4942 // validate layout
4943 const auto old_pi = cur->get_projected_inode();
4944 file_layout_t layout;
4945 if (old_pi->has_layout())
4946 layout = old_pi->layout;
4947 else if (dir_layout)
4948 layout = *dir_layout;
4949 else
4950 layout = mdcache->default_file_layout;
4951
4952 // Level of access required to complete
4953 int access = MAY_WRITE;
4954
4955 const auto old_layout = layout;
4956
4957 if (req->head.args.setlayout.layout.fl_object_size > 0)
4958 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
4959 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
4960 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
4961 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
4962 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
4963 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
4964 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
4965 // make sure we have as new a map as the client
4966 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4967 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4968 return;
4969 }
4970 }
4971
4972 if (layout != old_layout) {
4973 access |= MAY_SET_VXATTR;
4974 }
4975
4976 if (!layout.is_valid()) {
4977 dout(10) << "bad layout" << dendl;
4978 respond_to_request(mdr, -EINVAL);
4979 return;
4980 }
4981 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4982 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4983 respond_to_request(mdr, -EINVAL);
4984 return;
4985 }
4986
4987 if (!check_access(mdr, cur, access))
4988 return;
4989
4990 auto &pi = cur->project_inode();
4991 pi.inode.layout = layout;
4992 pi.inode.version = cur->pre_dirty();
4993
4994 // log + wait
4995 mdr->ls = mdlog->get_current_segment();
4996 EUpdate *le = new EUpdate(mdlog, "setlayout");
4997 mdlog->start_entry(le);
4998 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4999 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5000 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5001
5002 mdr->no_early_reply = true;
5003 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5004 }
5005
5006 // XATTRS
5007
5008 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5009 file_layout_t *layout, bool validate)
5010 {
5011 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5012 try {
5013 if (name == "layout") {
5014 string::iterator begin = value.begin();
5015 string::iterator end = value.end();
5016 keys_and_values<string::iterator> p; // create instance of parser
5017 std::map<string, string> m; // map to receive results
5018 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5019 return -EINVAL;
5020 }
5021 string left(begin, end);
5022 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5023 if (begin != end)
5024 return -EINVAL;
5025 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5026 // Skip validation on each attr; we do it once at the end (to avoid
5027 // rejecting intermediate states if the overall result is ok)
5028 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5029 osdmap, layout, false);
5030 if (r < 0)
5031 return r;
5032 }
5033 } else if (name == "layout.object_size") {
5034 layout->object_size = boost::lexical_cast<unsigned>(value);
5035 } else if (name == "layout.stripe_unit") {
5036 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5037 } else if (name == "layout.stripe_count") {
5038 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5039 } else if (name == "layout.pool") {
5040 try {
5041 layout->pool_id = boost::lexical_cast<unsigned>(value);
5042 } catch (boost::bad_lexical_cast const&) {
5043 int64_t pool = osdmap.lookup_pg_pool_name(value);
5044 if (pool < 0) {
5045 dout(10) << " unknown pool " << value << dendl;
5046 return -ENOENT;
5047 }
5048 layout->pool_id = pool;
5049 }
5050 } else if (name == "layout.pool_namespace") {
5051 layout->pool_ns = value;
5052 } else {
5053 dout(10) << " unknown layout vxattr " << name << dendl;
5054 return -EINVAL;
5055 }
5056 } catch (boost::bad_lexical_cast const&) {
5057 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5058 return -EINVAL;
5059 }
5060
5061 if (validate && !layout->is_valid()) {
5062 dout(10) << "bad layout" << dendl;
5063 return -EINVAL;
5064 }
5065 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5066 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5067 return -EINVAL;
5068 }
5069 return 0;
5070 }
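// The sketch below (not part of the original source) illustrates the
// whitespace-separated "key=value" shape that the composite "layout"
// vxattr above expects before each pair is fed back through
// parse_layout_vxattr() with a "layout." prefix. The real parser is a
// boost::spirit grammar (keys_and_values); this stand-in is a hypothetical
// standard-library-only equivalent.
#if 0
#include <iostream>
#include <map>
#include <sstream>
#include <string>

static std::map<std::string, std::string> parse_kv(const std::string& s)
{
  std::map<std::string, std::string> m;
  std::istringstream in(s);
  std::string tok;
  while (in >> tok) {                    // pairs are whitespace separated
    auto eq = tok.find('=');
    if (eq == std::string::npos)
      return {};                         // malformed pair -> reject everything
    m[tok.substr(0, eq)] = tok.substr(eq + 1);
  }
  return m;
}

int main()
{
  // e.g. the value from: setfattr -n ceph.dir.layout -v "stripe_unit=..."
  auto m = parse_kv("stripe_unit=4194304 stripe_count=1 pool=cephfs_data");
  for (const auto& [k, v] : m)
    std::cout << "layout." << k << " = " << v << "\n";
  return 0;
}
#endif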
5071
5072 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5073 {
5074 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5075 try {
5076 if (name == "quota") {
5077 string::iterator begin = value.begin();
5078 string::iterator end = value.end();
5079 if (begin == end) {
5080 // keep quota unchanged. (for create_quota_realm())
5081 return 0;
5082 }
5083 keys_and_values<string::iterator> p; // create instance of parser
5084 std::map<string, string> m; // map to receive results
5085 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5086 return -EINVAL;
5087 }
5088 string left(begin, end);
5089 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5090 if (begin != end)
5091 return -EINVAL;
5092 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5093 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5094 if (r < 0)
5095 return r;
5096 }
5097 } else if (name == "quota.max_bytes") {
5098 int64_t q = boost::lexical_cast<int64_t>(value);
5099 if (q < 0)
5100 return -EINVAL;
5101 quota->max_bytes = q;
5102 } else if (name == "quota.max_files") {
5103 int64_t q = boost::lexical_cast<int64_t>(value);
5104 if (q < 0)
5105 return -EINVAL;
5106 quota->max_files = q;
5107 } else {
5108 dout(10) << " unknown quota vxattr " << name << dendl;
5109 return -EINVAL;
5110 }
5111 } catch (boost::bad_lexical_cast const&) {
5112 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5113 return -EINVAL;
5114 }
5115
5116 if (!quota->is_valid()) {
5117 dout(10) << "bad quota" << dendl;
5118 return -EINVAL;
5119 }
5120 return 0;
5121 }
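// A minimal sketch (not part of the original source) of the value rules
// enforced above for quota.max_bytes and quota.max_files: the value must
// parse as a signed 64-bit integer, negatives are rejected with -EINVAL,
// and 0 means "no limit". The helper is hypothetical, standard library only.
#if 0
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <string>

// Returns 0 and fills *out on success, -EINVAL otherwise.
static int parse_quota_value(const std::string& value, int64_t *out)
{
  char *end = nullptr;
  errno = 0;
  long long q = std::strtoll(value.c_str(), &end, 10);
  if (errno || end == value.c_str() || *end != '\0')
    return -EINVAL;     // not a (complete) integer
  if (q < 0)
    return -EINVAL;     // quotas must be non-negative
  *out = q;             // 0 disables the limit
  return 0;
}
#endif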
5122
5123 void Server::create_quota_realm(CInode *in)
5124 {
5125 dout(10) << __func__ << " " << *in << dendl;
5126
5127 auto req = MClientRequest::create(CEPH_MDS_OP_SETXATTR);
5128 req->set_filepath(filepath(in->ino()));
5129 req->set_string2("ceph.quota");
5130 // empty vxattr value
5131 req->set_tid(mds->issue_tid());
5132
5133 mds->send_message_mds(req, in->authority().first);
5134 }
5135
5136 /*
5137 * Verify that the file layout attribute carried by the client
5138 * is well-formed.
5139 * Returns 0 on success; otherwise this function takes
5140 * responsibility for the passed mdr.
5141 */
5142 int Server::check_layout_vxattr(MDRequestRef& mdr,
5143 string name,
5144 string value,
5145 file_layout_t *layout)
5146 {
5147 const MClientRequest::const_ref &req = mdr->client_request;
5148 epoch_t epoch;
5149 int r;
5150
5151 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5152 r = parse_layout_vxattr(name, value, osdmap, layout);
5153 epoch = osdmap.get_epoch();
5154 });
5155
5156 if (r == -ENOENT) {
5157
5158 // we don't have the specified pool; make sure our map
5159 // is at least as new as the client's.
5160 epoch_t req_epoch = req->get_osdmap_epoch();
5161
5162 if (req_epoch > epoch) {
5163
5164 // well, our map is older. consult mds.
5165 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5166
5167 if (!mds->objecter->wait_for_map(req_epoch, fin))
5168 return r; // wait, fin will retry this request later
5169
5170 delete fin;
5171
5172 // now we have at least as new a map as the client, try again.
5173 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5174 r = parse_layout_vxattr(name, value, osdmap, layout);
5175 epoch = osdmap.get_epoch();
5176 });
5177
5178 ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
5179
5180 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5181
5182 // For compatibility with clients running old code, we still need to
5183 // get the latest map. One day, once COMPAT_VERSION of MClientRequest
5184 // is >= 3, we can remove this code.
5185 mdr->waited_for_osdmap = true;
5186 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
5187 mds, new C_MDS_RetryRequest(mdcache, mdr)));
5188 return r;
5189 }
5190 }
5191
5192 if (r < 0) {
5193
5194 if (r == -ENOENT)
5195 r = -EINVAL;
5196
5197 respond_to_request(mdr, r);
5198 return r;
5199 }
5200
5201 // all is well
5202 return 0;
5203 }
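// A schematic sketch (not part of the original source) of the retry
// pattern above: when the pool lookup fails and the request carries a
// newer OSDMap epoch than ours, park a retry callback until our map
// catches up instead of failing outright. MapSource is a hypothetical
// stand-in for the Objecter/OSDMap pair.
#if 0
#include <functional>
#include <utility>

struct MapSource {
  unsigned epoch = 0;
  std::function<void()> pending;    // fired once a map >= wanted arrives
  // True if we already have epoch e (the same convention as the
  // wait_for_map() call above); false means 'retry' was queued.
  bool wait_for_map(unsigned e, std::function<void()> retry) {
    if (epoch >= e)
      return true;                  // map already new enough
    pending = std::move(retry);     // defer; rerun when the map lands
    return false;
  }
};

// Returns true if the caller should stop now and let the retry re-drive.
static bool defer_until_epoch(MapSource& maps, unsigned req_epoch,
                              std::function<void()> retry)
{
  if (req_epoch <= maps.epoch)
    return false;                   // our -ENOENT is final; report it
  return !maps.wait_for_map(req_epoch, std::move(retry));
}
#endif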
5204
5205 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
5206 file_layout_t *dir_layout,
5207 MutationImpl::LockOpVec& lov)
5208 {
5209 const MClientRequest::const_ref &req = mdr->client_request;
5210 string name(req->get_path2());
5211 bufferlist bl = req->get_data();
5212 string value (bl.c_str(), bl.length());
5213 dout(10) << "handle_set_vxattr " << name
5214 << " val " << value.length()
5215 << " bytes on " << *cur
5216 << dendl;
5217
5218 CInode::mempool_inode *pip = nullptr;
5219 string rest;
5220
5221 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5222 return;
5223 }
5224
5225 bool new_realm = false;
5226 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5227 if (!cur->is_dir()) {
5228 respond_to_request(mdr, -EINVAL);
5229 return;
5230 }
5231
5232 file_layout_t layout;
5233 if (cur->get_projected_inode()->has_layout())
5234 layout = cur->get_projected_inode()->layout;
5235 else if (dir_layout)
5236 layout = *dir_layout;
5237 else
5238 layout = mdcache->default_file_layout;
5239
5240 rest = name.substr(name.find("layout"));
5241 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5242 return;
5243
5244 lov.add_xlock(&cur->policylock);
5245 if (!mds->locker->acquire_locks(mdr, lov))
5246 return;
5247
5248 auto &pi = cur->project_inode();
5249 pi.inode.layout = layout;
5250 mdr->no_early_reply = true;
5251 pip = &pi.inode;
5252 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5253 if (!cur->is_file()) {
5254 respond_to_request(mdr, -EINVAL);
5255 return;
5256 }
5257 if (cur->get_projected_inode()->size ||
5258 cur->get_projected_inode()->truncate_seq > 1) {
5259 respond_to_request(mdr, -ENOTEMPTY);
5260 return;
5261 }
5262 file_layout_t layout = cur->get_projected_inode()->layout;
5263 rest = name.substr(name.find("layout"));
5264 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5265 return;
5266
5267 lov.add_xlock(&cur->filelock);
5268 if (!mds->locker->acquire_locks(mdr, lov))
5269 return;
5270
5271 auto &pi = cur->project_inode();
5272 int64_t old_pool = pi.inode.layout.pool_id;
5273 pi.inode.add_old_pool(old_pool);
5274 pi.inode.layout = layout;
5275 pip = &pi.inode;
5276 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5277 if (!cur->is_dir() || cur->is_root()) {
5278 respond_to_request(mdr, -EINVAL);
5279 return;
5280 }
5281
5282 quota_info_t quota = cur->get_projected_inode()->quota;
5283
5284 rest = name.substr(name.find("quota"));
5285 int r = parse_quota_vxattr(rest, value, &quota);
5286 if (r < 0) {
5287 respond_to_request(mdr, r);
5288 return;
5289 }
5290
5291 lov.add_xlock(&cur->policylock);
5292 if (quota.is_enable() && !cur->get_projected_srnode()) {
5293 lov.add_xlock(&cur->snaplock);
5294 new_realm = true;
5295 }
5296
5297 if (!mds->locker->acquire_locks(mdr, lov))
5298 return;
5299
5300 auto &pi = cur->project_inode(false, new_realm);
5301 pi.inode.quota = quota;
5302
5303 if (new_realm) {
5304 SnapRealm *realm = cur->find_snaprealm();
5305 auto seq = realm->get_newest_seq();
5306 auto &newsnap = *pi.snapnode;
5307 newsnap.created = seq;
5308 newsnap.seq = seq;
5309 }
5310 mdr->no_early_reply = true;
5311 pip = &pi.inode;
5312
5313 client_t exclude_ct = mdr->get_client();
5314 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5315 } else if (name.find("ceph.dir.pin") == 0) {
5316 if (!cur->is_dir() || cur->is_root()) {
5317 respond_to_request(mdr, -EINVAL);
5318 return;
5319 }
5320
5321 mds_rank_t rank;
5322 try {
5323 rank = boost::lexical_cast<mds_rank_t>(value);
5324 if (rank < 0) rank = MDS_RANK_NONE;
5325 } catch (boost::bad_lexical_cast const&) {
5326 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5327 respond_to_request(mdr, -EINVAL);
5328 return;
5329 }
5330
5331 lov.add_xlock(&cur->policylock);
5332 if (!mds->locker->acquire_locks(mdr, lov))
5333 return;
5334
5335 auto &pi = cur->project_inode();
5336 cur->set_export_pin(rank);
5337 pip = &pi.inode;
5338 } else {
5339 dout(10) << " unknown vxattr " << name << dendl;
5340 respond_to_request(mdr, -EINVAL);
5341 return;
5342 }
5343
5344 pip->change_attr++;
5345 pip->ctime = mdr->get_op_stamp();
5346 if (mdr->get_op_stamp() > pip->rstat.rctime)
5347 pip->rstat.rctime = mdr->get_op_stamp();
5348 pip->version = cur->pre_dirty();
5349 if (cur->is_file())
5350 pip->update_backtrace();
5351
5352 // log + wait
5353 mdr->ls = mdlog->get_current_segment();
5354 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5355 mdlog->start_entry(le);
5356 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5357 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5358 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5359
5360 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5361 false, false, new_realm));
5362 return;
5363 }
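// A hedged usage sketch (not part of the original source) of how the
// branches above are reached from a client: each "ceph.*" name is a
// virtual xattr set on a CephFS mount with an ordinary setxattr(2) call.
// The mount path and values below are hypothetical examples.
#if 0
#include <sys/xattr.h>
#include <cstdio>
#include <cstring>

int main()
{
  // pin a directory subtree to MDS rank 0 ("ceph.dir.pin" branch above)
  if (setxattr("/mnt/cephfs/projects", "ceph.dir.pin", "0", 1, 0) < 0)
    perror("setxattr ceph.dir.pin");

  // composite quota value ("ceph.quota" branch, parsed as key=value pairs)
  const char *v = "max_bytes=10737418240 max_files=100000";
  if (setxattr("/mnt/cephfs/projects", "ceph.quota", v, strlen(v), 0) < 0)
    perror("setxattr ceph.quota");
  return 0;
}
#endif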
5364
5365 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur,
5366 file_layout_t *dir_layout,
5367 MutationImpl::LockOpVec& lov)
5368 {
5369 const MClientRequest::const_ref &req = mdr->client_request;
5370 string name(req->get_path2());
5371
5372 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5373
5374 if (name == "ceph.dir.layout") {
5375 if (!cur->is_dir()) {
5376 respond_to_request(mdr, -ENODATA);
5377 return;
5378 }
5379 if (cur->is_root()) {
5380 dout(10) << "can't remove layout policy on the root directory" << dendl;
5381 respond_to_request(mdr, -EINVAL);
5382 return;
5383 }
5384
5385 if (!cur->get_projected_inode()->has_layout()) {
5386 respond_to_request(mdr, -ENODATA);
5387 return;
5388 }
5389
5390 lov.add_xlock(&cur->policylock);
5391 if (!mds->locker->acquire_locks(mdr, lov))
5392 return;
5393
5394 auto &pi = cur->project_inode();
5395 pi.inode.clear_layout();
5396 pi.inode.version = cur->pre_dirty();
5397
5398 // log + wait
5399 mdr->ls = mdlog->get_current_segment();
5400 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5401 mdlog->start_entry(le);
5402 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5403 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5404 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5405
5406 mdr->no_early_reply = true;
5407 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5408 return;
5409 } else if (name == "ceph.dir.layout.pool_namespace"
5410 || name == "ceph.file.layout.pool_namespace") {
5411 // Namespace is the only layout field that has a meaningful
5412 // null/none value (the empty string, meaning the default layout).
5413 // Removing it is equivalent to a setxattr with an empty value, so
5414 // pass through the empty payload of the rmxattr request to do this.
5415 handle_set_vxattr(mdr, cur, dir_layout, lov);
5416 return;
5417 }
5418
5419 respond_to_request(mdr, -ENODATA);
5420 }
5421
5422 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
5423 CInode *in;
5424 public:
5425
5426 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
5427 ServerLogContext(s, r), in(i) { }
5428 void finish(int r) override {
5429 ceph_assert(r == 0);
5430
5431 // apply
5432 in->pop_and_dirty_projected_inode(mdr->ls);
5433
5434 mdr->apply();
5435
5436 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5437
5438 server->respond_to_request(mdr, 0);
5439 }
5440 };
5441
5442 void Server::handle_client_setxattr(MDRequestRef& mdr)
5443 {
5444 const MClientRequest::const_ref &req = mdr->client_request;
5445 string name(req->get_path2());
5446 MutationImpl::LockOpVec lov;
5447 CInode *cur;
5448
5449 file_layout_t *dir_layout = NULL;
5450 if (name.compare(0, 15, "ceph.dir.layout") == 0)
5451 cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
5452 else
5453 cur = rdlock_path_pin_ref(mdr, 0, lov, true);
5454 if (!cur)
5455 return;
5456
5457 if (mdr->snapid != CEPH_NOSNAP) {
5458 respond_to_request(mdr, -EROFS);
5459 return;
5460 }
5461
5462 int flags = req->head.args.setxattr.flags;
5463
5464 // magic ceph.* namespace?
5465 if (name.compare(0, 5, "ceph.") == 0) {
5466 handle_set_vxattr(mdr, cur, dir_layout, lov);
5467 return;
5468 }
5469
5470 lov.add_xlock(&cur->xattrlock);
5471 if (!mds->locker->acquire_locks(mdr, lov))
5472 return;
5473
5474 if (!check_access(mdr, cur, MAY_WRITE))
5475 return;
5476
5477 auto pxattrs = cur->get_projected_xattrs();
5478 size_t len = req->get_data().length();
5479 size_t inc = len + name.length();
5480
5481 // check xattrs kv pairs size
5482 size_t cur_xattrs_size = 0;
5483 for (const auto& p : *pxattrs) {
5484 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
5485 continue;
5486 }
5487 cur_xattrs_size += p.first.length() + p.second.length();
5488 }
5489
5490 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
5491 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5492 << cur_xattrs_size << ", inc " << inc << dendl;
5493 respond_to_request(mdr, -ENOSPC);
5494 return;
5495 }
5496
5497 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
5498 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
5499 respond_to_request(mdr, -EEXIST);
5500 return;
5501 }
5502 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
5503 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
5504 respond_to_request(mdr, -ENODATA);
5505 return;
5506 }
5507
5508 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
5509
5510 // project update
5511 auto &pi = cur->project_inode(true);
5512 pi.inode.version = cur->pre_dirty();
5513 pi.inode.ctime = mdr->get_op_stamp();
5514 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5515 pi.inode.rstat.rctime = mdr->get_op_stamp();
5516 pi.inode.change_attr++;
5517 pi.inode.xattr_version++;
5518 auto &px = *pi.xattrs;
5519 if ((flags & CEPH_XATTR_REMOVE)) {
5520 px.erase(mempool::mds_co::string(name));
5521 } else {
5522 bufferptr b = buffer::create(len);
5523 if (len)
5524 req->get_data().copy(0, len, b.c_str());
5525 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
5526 if (!em.second)
5527 em.first->second = b;
5528 }
5529
5530 // log + wait
5531 mdr->ls = mdlog->get_current_segment();
5532 EUpdate *le = new EUpdate(mdlog, "setxattr");
5533 mdlog->start_entry(le);
5534 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5535 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5536 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5537
5538 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5539 }
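// A compact sketch (not part of the original source) of the size
// accounting above: the sum of every stored key+value length, minus the
// key being replaced under CEPH_XATTR_REPLACE, plus the incoming pair,
// must not exceed the cap (mds_max_xattr_pairs_size). The helper is
// hypothetical and standard-library only.
#if 0
#include <cstddef>
#include <map>
#include <string>

static bool xattr_fits(const std::map<std::string, std::string>& xattrs,
                       const std::string& name, std::size_t value_len,
                       bool replace, std::size_t max_pairs_size)
{
  std::size_t cur = 0;
  for (const auto& [k, v] : xattrs) {
    if (replace && k == name)
      continue;                     // old size is superseded by the new pair
    cur += k.length() + v.length();
  }
  return cur + name.length() + value_len <= max_pairs_size;
}
#endif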
5540
5541 void Server::handle_client_removexattr(MDRequestRef& mdr)
5542 {
5543 const MClientRequest::const_ref &req = mdr->client_request;
5544 std::string name(req->get_path2());
5545
5546 MutationImpl::LockOpVec lov;
5547 file_layout_t *dir_layout = nullptr;
5548 CInode *cur;
5549 if (name == "ceph.dir.layout")
5550 cur = rdlock_path_pin_ref(mdr, 0, lov, true, false, &dir_layout);
5551 else
5552 cur = rdlock_path_pin_ref(mdr, 0, lov, true);
5553 if (!cur)
5554 return;
5555
5556 if (mdr->snapid != CEPH_NOSNAP) {
5557 respond_to_request(mdr, -EROFS);
5558 return;
5559 }
5560
5561 if (name.compare(0, 5, "ceph.") == 0) {
5562 handle_remove_vxattr(mdr, cur, dir_layout, lov);
5563 return;
5564 }
5565
5566 lov.add_xlock(&cur->xattrlock);
5567 if (!mds->locker->acquire_locks(mdr, lov))
5568 return;
5569
5570 auto pxattrs = cur->get_projected_xattrs();
5571 if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
5572 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5573 respond_to_request(mdr, -ENODATA);
5574 return;
5575 }
5576
5577 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5578
5579 // project update
5580 auto &pi = cur->project_inode(true);
5581 auto &px = *pi.xattrs;
5582 pi.inode.version = cur->pre_dirty();
5583 pi.inode.ctime = mdr->get_op_stamp();
5584 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5585 pi.inode.rstat.rctime = mdr->get_op_stamp();
5586 pi.inode.change_attr++;
5587 pi.inode.xattr_version++;
5588 px.erase(mempool::mds_co::string(name));
5589
5590 // log + wait
5591 mdr->ls = mdlog->get_current_segment();
5592 EUpdate *le = new EUpdate(mdlog, "removexattr");
5593 mdlog->start_entry(le);
5594 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5595 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5596 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5597
5598 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5599 }
5600
5601
5602 // =================================================================
5603 // DIRECTORY and NAMESPACE OPS
5604
5605
5606 // ------------------------------------------------
5607
5608 // MKNOD
5609
5610 class C_MDS_mknod_finish : public ServerLogContext {
5611 CDentry *dn;
5612 CInode *newi;
5613 public:
5614 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5615 ServerLogContext(s, r), dn(d), newi(ni) {}
5616 void finish(int r) override {
5617 ceph_assert(r == 0);
5618
5619 // link the inode
5620 dn->pop_projected_linkage();
5621
5622 // be a bit hacky with the inode version here: we decrement it
5623 // just to keep mark_dirty() happy. (we didn't bother projecting
5624 // a new version of the inode since it's just been created)
5625 newi->inode.version--;
5626 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5627 newi->mark_dirty_parent(mdr->ls, true);
5628
5629 // mkdir?
5630 if (newi->inode.is_dir()) {
5631 CDir *dir = newi->get_dirfrag(frag_t());
5632 ceph_assert(dir);
5633 dir->fnode.version--;
5634 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5635 dir->mark_new(mdr->ls);
5636 }
5637
5638 mdr->apply();
5639
5640 MDRequestRef null_ref;
5641 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5642
5643 if (newi->inode.is_file())
5644 get_mds()->locker->share_inode_max_size(newi);
5645
5646 // hit pop
5647 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
5648
5649 // reply
5650 server->respond_to_request(mdr, 0);
5651 }
5652 };
5653
5654
5655 void Server::handle_client_mknod(MDRequestRef& mdr)
5656 {
5657 const MClientRequest::const_ref &req = mdr->client_request;
5658 client_t client = mdr->get_client();
5659 MutationImpl::LockOpVec lov;
5660 file_layout_t *dir_layout = nullptr;
5661 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false,
5662 &dir_layout);
5663 if (!dn) return;
5664 if (mdr->snapid != CEPH_NOSNAP) {
5665 respond_to_request(mdr, -EROFS);
5666 return;
5667 }
5668 CInode *diri = dn->get_dir()->get_inode();
5669 lov.add_rdlock(&diri->authlock);
5670 if (!mds->locker->acquire_locks(mdr, lov))
5671 return;
5672
5673 if (!check_access(mdr, diri, MAY_WRITE))
5674 return;
5675
5676 if (!check_fragment_space(mdr, dn->get_dir()))
5677 return;
5678
5679 unsigned mode = req->head.args.mknod.mode;
5680 if ((mode & S_IFMT) == 0)
5681 mode |= S_IFREG;
5682
5683 // set layout
5684 file_layout_t layout;
5685 if (dir_layout && S_ISREG(mode))
5686 layout = *dir_layout;
5687 else
5688 layout = mdcache->default_file_layout;
5689
5690 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
5691 ceph_assert(newi);
5692
5693 dn->push_projected_linkage(newi);
5694
5695 newi->inode.rdev = req->head.args.mknod.rdev;
5696 newi->inode.version = dn->pre_dirty();
5697 newi->inode.rstat.rfiles = 1;
5698 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5699 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5700 newi->inode.update_backtrace();
5701
5702 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5703 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5704 ceph_assert(follows >= realm->get_newest_seq());
5705
5706 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5707 // want to write to it (e.g., if they are reexporting NFS)
5708 if (S_ISREG(newi->inode.mode)) {
5709 // issue a cap on the file
5710 int cmode = CEPH_FILE_MODE_RDWR;
5711 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5712 if (cap) {
5713 cap->set_wanted(0);
5714
5715 // put locks in excl mode
5716 newi->filelock.set_state(LOCK_EXCL);
5717 newi->authlock.set_state(LOCK_EXCL);
5718 newi->xattrlock.set_state(LOCK_EXCL);
5719
5720 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
5721 newi->inode.client_ranges[client].range.first = 0;
5722 newi->inode.client_ranges[client].range.last = newi->inode.get_layout_size_increment();
5723 newi->inode.client_ranges[client].follows = follows;
5724 cap->mark_clientwriteable();
5725 }
5726 }
5727
5728 ceph_assert(dn->first == follows + 1);
5729 newi->first = dn->first;
5730
5731 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
5732
5733 // prepare finisher
5734 mdr->ls = mdlog->get_current_segment();
5735 EUpdate *le = new EUpdate(mdlog, "mknod");
5736 mdlog->start_entry(le);
5737 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5738 journal_allocated_inos(mdr, &le->metablob);
5739
5740 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
5741 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5742 le->metablob.add_primary_dentry(dn, newi, true, true, true);
5743
5744 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5745 }
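// A small sketch (not part of the original source) of the mode defaulting
// at the top of handle_client_mknod(): when the client sets no file-type
// bits, the type defaults to a regular file, which is also what gates the
// dir_layout and rdwr-cap paths above.
#if 0
#include <sys/stat.h>
#include <cassert>

static unsigned normalize_mknod_mode(unsigned mode)
{
  if ((mode & S_IFMT) == 0)   // no type bits supplied by the client
    mode |= S_IFREG;          // default to a regular file
  return mode;
}

int main()
{
  assert(S_ISREG(normalize_mknod_mode(0644)));             // defaulted
  assert(S_ISFIFO(normalize_mknod_mode(S_IFIFO | 0644)));  // preserved
  return 0;
}
#endif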
5746
5747
5748
5749 // MKDIR
5750 /* This function takes responsibility for the passed mdr */
5751 void Server::handle_client_mkdir(MDRequestRef& mdr)
5752 {
5753 const MClientRequest::const_ref &req = mdr->client_request;
5754 if (req->get_filepath().is_last_dot_or_dotdot()) {
5755 respond_to_request(mdr, -EEXIST);
5756 return;
5757 }
5758
5759 MutationImpl::LockOpVec lov;
5760 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5761 if (!dn) return;
5762 if (mdr->snapid != CEPH_NOSNAP) {
5763 respond_to_request(mdr, -EROFS);
5764 return;
5765 }
5766 CDir *dir = dn->get_dir();
5767 CInode *diri = dir->get_inode();
5768 lov.add_rdlock(&diri->authlock);
5769 if (!mds->locker->acquire_locks(mdr, lov))
5770 return;
5771
5772 // mkdir check access
5773 if (!check_access(mdr, diri, MAY_WRITE))
5774 return;
5775
5776 if (!check_fragment_space(mdr, dir))
5777 return;
5778
5779 // new inode
5780 unsigned mode = req->head.args.mkdir.mode;
5781 mode &= ~S_IFMT;
5782 mode |= S_IFDIR;
5783 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5784 ceph_assert(newi);
5785
5786 // it's a directory.
5787 dn->push_projected_linkage(newi);
5788
5789 newi->inode.version = dn->pre_dirty();
5790 newi->inode.rstat.rsubdirs = 1;
5791 newi->inode.update_backtrace();
5792
5793 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5794 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5795 ceph_assert(follows >= realm->get_newest_seq());
5796
5797 dout(12) << " follows " << follows << dendl;
5798 ceph_assert(dn->first == follows + 1);
5799 newi->first = dn->first;
5800
5801 // ...and that new dir is empty.
5802 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
5803 newdir->state_set(CDir::STATE_CREATING);
5804 newdir->mark_complete();
5805 newdir->fnode.version = newdir->pre_dirty();
5806
5807 // prepare finisher
5808 mdr->ls = mdlog->get_current_segment();
5809 EUpdate *le = new EUpdate(mdlog, "mkdir");
5810 mdlog->start_entry(le);
5811 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5812 journal_allocated_inos(mdr, &le->metablob);
5813 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5814 le->metablob.add_primary_dentry(dn, newi, true, true);
5815 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
5816
5817 // issue a cap on the directory
5818 int cmode = CEPH_FILE_MODE_RDWR;
5819 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
5820 if (cap) {
5821 cap->set_wanted(0);
5822
5823 // put locks in excl mode
5824 newi->filelock.set_state(LOCK_EXCL);
5825 newi->authlock.set_state(LOCK_EXCL);
5826 newi->xattrlock.set_state(LOCK_EXCL);
5827 }
5828
5829 // make sure this inode gets into the journal
5830 le->metablob.add_opened_ino(newi->ino());
5831
5832 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5833 }
5834
5835
5836 // SYMLINK
5837
5838 void Server::handle_client_symlink(MDRequestRef& mdr)
5839 {
5840 const MClientRequest::const_ref &req = mdr->client_request;
5841 MutationImpl::LockOpVec lov;
5842 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5843 if (!dn) return;
5844 if (mdr->snapid != CEPH_NOSNAP) {
5845 respond_to_request(mdr, -EROFS);
5846 return;
5847 }
5848 CDir *dir = dn->get_dir();
5849 CInode *diri = dir->get_inode();
5850 lov.add_rdlock(&diri->authlock);
5851 if (!mds->locker->acquire_locks(mdr, lov))
5852 return;
5853
5854 if (!check_access(mdr, diri, MAY_WRITE))
5855 return;
5856
5857 if (!check_fragment_space(mdr, dir))
5858 return;
5859
5860 unsigned mode = S_IFLNK | 0777;
5861 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
5862 ceph_assert(newi);
5863
5864 // it's a symlink
5865 dn->push_projected_linkage(newi);
5866
5867 newi->symlink = req->get_path2();
5868 newi->inode.size = newi->symlink.length();
5869 newi->inode.rstat.rbytes = newi->inode.size;
5870 newi->inode.rstat.rfiles = 1;
5871 newi->inode.version = dn->pre_dirty();
5872 newi->inode.update_backtrace();
5873
5874 newi->first = dn->first;
5875
5876 // prepare finisher
5877 mdr->ls = mdlog->get_current_segment();
5878 EUpdate *le = new EUpdate(mdlog, "symlink");
5879 mdlog->start_entry(le);
5880 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5881 journal_allocated_inos(mdr, &le->metablob);
5882 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
5883 le->metablob.add_primary_dentry(dn, newi, true, true);
5884
5885 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
5886 }
5887
5888
5889
5890
5891
5892 // LINK
5893
5894 void Server::handle_client_link(MDRequestRef& mdr)
5895 {
5896 const MClientRequest::const_ref &req = mdr->client_request;
5897
5898 dout(7) << "handle_client_link " << req->get_filepath()
5899 << " to " << req->get_filepath2()
5900 << dendl;
5901
5902 MutationImpl::LockOpVec lov;
5903
5904 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, lov, false, false, false);
5905 if (!dn) return;
5906 CInode *targeti = rdlock_path_pin_ref(mdr, 1, lov, false);
5907 if (!targeti) return;
5908 if (mdr->snapid != CEPH_NOSNAP) {
5909 respond_to_request(mdr, -EROFS);
5910 return;
5911 }
5912
5913 CDir *dir = dn->get_dir();
5914 dout(7) << "handle_client_link link " << dn->get_name() << " in " << *dir << dendl;
5915 dout(7) << "target is " << *targeti << dendl;
5916 if (targeti->is_dir()) {
5917 // if srcdn is a replica, we need to make sure its linkage is correct
5918 vector<CDentry*>& trace = mdr->dn[1];
5919 if (trace.empty() ||
5920 trace.back()->is_auth() ||
5921 trace.back()->lock.can_read(mdr->get_client())) {
5922 dout(7) << "target is a dir, failing..." << dendl;
5923 respond_to_request(mdr, -EINVAL);
5924 return;
5925 }
5926 }
5927
5928 lov.erase_rdlock(&targeti->snaplock);
5929 lov.add_xlock(&targeti->snaplock);
5930 lov.add_xlock(&targeti->linklock);
5931
5932 if (!mds->locker->acquire_locks(mdr, lov))
5933 return;
5934
5935 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
5936 if (!check_access(mdr, targeti, MAY_WRITE))
5937 return;
5938
5939 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
5940 return;
5941
5942 if (!check_fragment_space(mdr, dir))
5943 return;
5944 }
5945
5946 // go!
5947 ceph_assert(g_conf()->mds_kill_link_at != 1);
5948
5949 // local or remote?
5950 if (targeti->is_auth())
5951 _link_local(mdr, dn, targeti);
5952 else
5953 _link_remote(mdr, true, dn, targeti);
5954 }
5955
5956
5957 class C_MDS_link_local_finish : public ServerLogContext {
5958 CDentry *dn;
5959 CInode *targeti;
5960 version_t dnpv;
5961 version_t tipv;
5962 bool adjust_realm;
5963 public:
5964 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
5965 version_t dnpv_, version_t tipv_, bool ar) :
5966 ServerLogContext(s, r), dn(d), targeti(ti),
5967 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
5968 void finish(int r) override {
5969 ceph_assert(r == 0);
5970 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
5971 }
5972 };
5973
5974
5975 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
5976 {
5977 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
5978
5979 mdr->ls = mdlog->get_current_segment();
5980
5981 // predirty NEW dentry
5982 version_t dnpv = dn->pre_dirty();
5983 version_t tipv = targeti->pre_dirty();
5984
5985 // project inode update
5986 auto &pi = targeti->project_inode();
5987 pi.inode.nlink++;
5988 pi.inode.ctime = mdr->get_op_stamp();
5989 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5990 pi.inode.rstat.rctime = mdr->get_op_stamp();
5991 pi.inode.change_attr++;
5992 pi.inode.version = tipv;
5993
5994 bool adjust_realm = false;
5995 if (!targeti->is_projected_snaprealm_global()) {
5996 sr_t *newsnap = targeti->project_snaprealm();
5997 targeti->mark_snaprealm_global(newsnap);
5998 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
5999 adjust_realm = true;
6000 }
6001
6002 // log + wait
6003 EUpdate *le = new EUpdate(mdlog, "link_local");
6004 mdlog->start_entry(le);
6005 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6006 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6007 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6008 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6009 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6010
6011 // do this after predirty_*, to avoid funky extra dnl arg
6012 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6013
6014 journal_and_reply(mdr, targeti, dn, le,
6015 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6016 }
6017
6018 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6019 version_t dnpv, version_t tipv, bool adjust_realm)
6020 {
6021 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6022
6023 // link and unlock the NEW dentry
6024 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6025 if (!dnl->get_inode())
6026 dn->link_remote(dnl, targeti);
6027 dn->mark_dirty(dnpv, mdr->ls);
6028
6029 // target inode
6030 targeti->pop_and_dirty_projected_inode(mdr->ls);
6031
6032 mdr->apply();
6033
6034 MDRequestRef null_ref;
6035 mdcache->send_dentry_link(dn, null_ref);
6036
6037 if (adjust_realm) {
6038 int op = CEPH_SNAP_OP_SPLIT;
6039 mds->mdcache->send_snap_update(targeti, 0, op);
6040 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6041 }
6042
6043 // bump target popularity
6044 mds->balancer->hit_inode(targeti, META_POP_IWR);
6045 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6046
6047 // reply
6048 respond_to_request(mdr, 0);
6049 }
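// A schematic sketch (not part of the original source) of the projection
// pattern _link_local()/_link_local_finish() follow: project a copy of the
// inode, mutate it, stamp the predirtied version, journal, and only pop
// the projection once the log entry commits. The types are hypothetical
// miniatures of CInode's projection machinery.
#if 0
#include <vector>

struct Proj { int nlink; unsigned version; };

struct SimpleInode {
  Proj cur{1, 1};
  std::vector<Proj> projected;          // pending, not yet durable

  Proj& project() {                     // cf. project_inode()
    projected.push_back(projected.empty() ? cur : projected.back());
    return projected.back();
  }
  void pop_and_apply() {                // cf. pop_and_dirty_projected_inode()
    cur = projected.front();            // the journal committed: make it real
    projected.erase(projected.begin());
  }
};

int main()
{
  SimpleInode in;
  Proj& pi = in.project();              // like targeti->project_inode()
  pi.nlink++;                           // the new link
  pi.version = in.cur.version + 1;      // like pre_dirty()
  // ... journal the EUpdate; then, in the log-commit finisher:
  in.pop_and_apply();
  return in.cur.nlink == 2 ? 0 : 1;
}
#endif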
6050
6051
6052 // link / unlink remote
6053
6054 class C_MDS_link_remote_finish : public ServerLogContext {
6055 bool inc;
6056 CDentry *dn;
6057 CInode *targeti;
6058 version_t dpv;
6059 public:
6060 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6061 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6062 dpv(d->get_projected_version()) {}
6063 void finish(int r) override {
6064 ceph_assert(r == 0);
6065 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6066 }
6067 };
6068
6069 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6070 {
6071 dout(10) << "_link_remote "
6072 << (inc ? "link ":"unlink ")
6073 << *dn << " to " << *targeti << dendl;
6074
6075 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6076 mds_rank_t linkauth = targeti->authority().first;
6077 if (mdr->more()->witnessed.count(linkauth) == 0) {
6078 if (mds->is_cluster_degraded() &&
6079 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6080 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6081 if (mdr->more()->waiting_on_slave.empty())
6082 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6083 return;
6084 }
6085
6086 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6087 int op;
6088 if (inc)
6089 op = MMDSSlaveRequest::OP_LINKPREP;
6090 else
6091 op = MMDSSlaveRequest::OP_UNLINKPREP;
6092 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, op);
6093 targeti->set_object_info(req->get_object_info());
6094 req->op_stamp = mdr->get_op_stamp();
6095 if (auto& desti_srnode = mdr->more()->desti_srnode)
6096 encode(*desti_srnode, req->desti_snapbl);
6097 mds->send_message_mds(req, linkauth);
6098
6099 ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
6100 mdr->more()->waiting_on_slave.insert(linkauth);
6101 return;
6102 }
6103 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6104
6105 ceph_assert(g_conf()->mds_kill_link_at != 2);
6106
6107 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6108 delete desti_srnode;
6109 desti_srnode = NULL;
6110 }
6111
6112 mdr->set_mds_stamp(ceph_clock_now());
6113
6114 // add to event
6115 mdr->ls = mdlog->get_current_segment();
6116 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6117 mdlog->start_entry(le);
6118 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6119 if (!mdr->more()->witnessed.empty()) {
6120 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6121 le->reqid = mdr->reqid;
6122 le->had_slaves = true;
6123 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6124 }
6125
6126 if (inc) {
6127 dn->pre_dirty();
6128 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6129 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6130 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6131 } else {
6132 dn->pre_dirty();
6133 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6134 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6135 le->metablob.add_null_dentry(dn, true);
6136 dn->push_projected_linkage();
6137 }
6138
6139 journal_and_reply(mdr, targeti, dn, le, new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6140 }
6141
6142 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6143 CDentry *dn, CInode *targeti,
6144 version_t dpv)
6145 {
6146 dout(10) << "_link_remote_finish "
6147 << (inc ? "link ":"unlink ")
6148 << *dn << " to " << *targeti << dendl;
6149
6150 ceph_assert(g_conf()->mds_kill_link_at != 3);
6151
6152 if (!mdr->more()->witnessed.empty())
6153 mdcache->logged_master_update(mdr->reqid);
6154
6155 if (inc) {
6156 // link the new dentry
6157 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6158 if (!dnl->get_inode())
6159 dn->link_remote(dnl, targeti);
6160 dn->mark_dirty(dpv, mdr->ls);
6161 } else {
6162 // unlink main dentry
6163 dn->get_dir()->unlink_inode(dn);
6164 dn->pop_projected_linkage();
6165 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6166 }
6167
6168 mdr->apply();
6169
6170 MDRequestRef null_ref;
6171 if (inc)
6172 mdcache->send_dentry_link(dn, null_ref);
6173 else
6174 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6175
6176 // bump target popularity
6177 mds->balancer->hit_inode(targeti, META_POP_IWR);
6178 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6179
6180 // reply
6181 respond_to_request(mdr, 0);
6182
6183 if (!inc)
6184 // removing a new dn?
6185 dn->get_dir()->try_remove_unlinked_dn(dn);
6186 }
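// A schematic sketch (not part of the original source) of the two-phase
// master/slave exchange used above for cross-MDS link/unlink: the master
// sends OP_LINKPREP or OP_UNLINKPREP, the witness journals the prepared
// nlink change and acks, and the master later commits or triggers a
// rollback. The message names match the code above; the structs are
// hypothetical.
#if 0
#include <cassert>

enum class SlaveState { idle, prepared, committed, rolled_back };

struct SlaveLink {
  SlaveState st = SlaveState::idle;
  int nlink = 1;
  bool prep(bool inc) {                // OP_LINKPREP / OP_UNLINKPREP
    nlink += inc ? 1 : -1;             // journaled prepare
    st = SlaveState::prepared;
    return true;                       // -> OP_LINKPREPACK to the master
  }
  void commit() { st = SlaveState::committed; }  // master journaled; done
  void rollback(bool was_inc) {        // master aborted before commit
    nlink += was_inc ? -1 : 1;         // undo the prepared change
    st = SlaveState::rolled_back;
  }
};

int main()
{
  SlaveLink s;
  assert(s.prep(true));                // witness prepared nlink++
  s.commit();                          // cf. _commit_slave_link(r == 0)
  assert(s.st == SlaveState::committed && s.nlink == 2);
  return 0;
}
#endif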
6187
6188
6189 // remote linking/unlinking
6190
6191 class C_MDS_SlaveLinkPrep : public ServerLogContext {
6192 CInode *targeti;
6193 bool adjust_realm;
6194 public:
6195 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6196 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6197 void finish(int r) override {
6198 ceph_assert(r == 0);
6199 server->_logged_slave_link(mdr, targeti, adjust_realm);
6200 }
6201 };
6202
6203 class C_MDS_SlaveLinkCommit : public ServerContext {
6204 MDRequestRef mdr;
6205 CInode *targeti;
6206 public:
6207 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6208 ServerContext(s), mdr(r), targeti(t) { }
6209 void finish(int r) override {
6210 server->_commit_slave_link(mdr, r, targeti);
6211 }
6212 };
6213
6214 void Server::handle_slave_link_prep(MDRequestRef& mdr)
6215 {
6216 dout(10) << "handle_slave_link_prep " << *mdr
6217 << " on " << mdr->slave_request->get_object_info()
6218 << dendl;
6219
6220 ceph_assert(g_conf()->mds_kill_link_at != 4);
6221
6222 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
6223 ceph_assert(targeti);
6224 dout(10) << "targeti " << *targeti << dendl;
6225 CDentry *dn = targeti->get_parent_dn();
6226 CDentry::linkage_t *dnl = dn->get_linkage();
6227 ceph_assert(dnl->is_primary());
6228
6229 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6230
6231 mdr->auth_pin(targeti);
6232
6233 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6234 ceph_assert(g_conf()->mds_kill_link_at != 5);
6235
6236 // journal it
6237 mdr->ls = mdlog->get_current_segment();
6238 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
6239 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
6240 mdlog->start_entry(le);
6241
6242 auto &pi = dnl->get_inode()->project_inode();
6243
6244 // update journaled target inode
6245 bool inc;
6246 bool adjust_realm = false;
6247 bool realm_projected = false;
6248 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
6249 inc = true;
6250 pi.inode.nlink++;
6251 if (!targeti->is_projected_snaprealm_global()) {
6252 sr_t *newsnap = targeti->project_snaprealm();
6253 targeti->mark_snaprealm_global(newsnap);
6254 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6255 adjust_realm = true;
6256 realm_projected = true;
6257 }
6258 } else {
6259 inc = false;
6260 pi.inode.nlink--;
6261 if (targeti->is_projected_snaprealm_global()) {
6262 ceph_assert(mdr->slave_request->desti_snapbl.length());
6263 auto p = mdr->slave_request->desti_snapbl.cbegin();
6264
6265 sr_t *newsnap = targeti->project_snaprealm();
6266 decode(*newsnap, p);
6267
6268 if (pi.inode.nlink == 0)
6269 ceph_assert(!newsnap->is_parent_global());
6270
6271 realm_projected = true;
6272 } else {
6273 ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
6274 }
6275 }
6276
6277 link_rollback rollback;
6278 rollback.reqid = mdr->reqid;
6279 rollback.ino = targeti->ino();
6280 rollback.old_ctime = targeti->inode.ctime; // we hold the versionlock xlock; no concurrent projections
6281 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
6282 rollback.old_dir_mtime = pf->fragstat.mtime;
6283 rollback.old_dir_rctime = pf->rstat.rctime;
6284 rollback.was_inc = inc;
6285 if (realm_projected) {
6286 if (targeti->snaprealm) {
6287 encode(true, rollback.snapbl);
6288 targeti->encode_snap_blob(rollback.snapbl);
6289 } else {
6290 encode(false, rollback.snapbl);
6291 }
6292 }
6293 encode(rollback, le->rollback);
6294 mdr->more()->rollback_bl = le->rollback;
6295
6296 pi.inode.ctime = mdr->get_op_stamp();
6297 pi.inode.version = targeti->pre_dirty();
6298
6299 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
6300
6301 // commit case
6302 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6303 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
6304
6305 // set up commit waiter
6306 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
6307
6308 mdr->more()->slave_update_journaled = true;
6309 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
6310 mdr, __func__);
6311 mdlog->flush();
6312 }
6313
6314 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
6315 {
6316 dout(10) << "_logged_slave_link " << *mdr
6317 << " " << *targeti << dendl;
6318
6319 ceph_assert(g_conf()->mds_kill_link_at != 6);
6320
6321 // update the target
6322 targeti->pop_and_dirty_projected_inode(mdr->ls);
6323 mdr->apply();
6324
6325 // hit pop
6326 mds->balancer->hit_inode(targeti, META_POP_IWR);
6327
6328 // done.
6329 mdr->reset_slave_request();
6330
6331 if (adjust_realm) {
6332 int op = CEPH_SNAP_OP_SPLIT;
6333 mds->mdcache->send_snap_update(targeti, 0, op);
6334 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6335 }
6336
6337 // ack
6338 if (!mdr->aborted) {
6339 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
6340 mds->send_message_mds(reply, mdr->slave_to_mds);
6341 } else {
6342 dout(10) << " abort flag set, finishing" << dendl;
6343 mdcache->request_finish(mdr);
6344 }
6345 }
6346
6347
6348 struct C_MDS_CommittedSlave : public ServerLogContext {
6349 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
6350 void finish(int r) override {
6351 server->_committed_slave(mdr);
6352 }
6353 };
6354
6355 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
6356 {
6357 dout(10) << "_commit_slave_link " << *mdr
6358 << " r=" << r
6359 << " " << *targeti << dendl;
6360
6361 ceph_assert(g_conf()->mds_kill_link_at != 7);
6362
6363 if (r == 0) {
6364 // drop our pins, etc.
6365 mdr->cleanup();
6366
6367 // write a commit to the journal
6368 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
6369 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
6370 mdlog->start_entry(le);
6371 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6372 mdlog->flush();
6373 } else {
6374 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6375 }
6376 }
6377
6378 void Server::_committed_slave(MDRequestRef& mdr)
6379 {
6380 dout(10) << "_committed_slave " << *mdr << dendl;
6381
6382 ceph_assert(g_conf()->mds_kill_link_at != 8);
6383
6384 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
6385 mds->send_message_mds(req, mdr->slave_to_mds);
6386 mdcache->request_finish(mdr);
6387 }
6388
6389 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
6390 MutationRef mut;
6391 map<client_t,MClientSnap::ref> splits;
6392 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
6393 map<client_t,MClientSnap::ref>&& _splits) :
6394 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
6395 }
6396 void finish(int r) override {
6397 server->_link_rollback_finish(mut, mdr, splits);
6398 }
6399 };
6400
6401 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6402 {
6403 link_rollback rollback;
6404 auto p = rbl.cbegin();
6405 decode(rollback, p);
6406
6407 dout(10) << "do_link_rollback on " << rollback.reqid
6408 << (rollback.was_inc ? " inc":" dec")
6409 << " ino " << rollback.ino
6410 << dendl;
6411
6412 ceph_assert(g_conf()->mds_kill_link_at != 9);
6413
6414 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6415 ceph_assert(mdr || mds->is_resolve());
6416
6417 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
6418 mut->ls = mds->mdlog->get_current_segment();
6419
6420 CInode *in = mdcache->get_inode(rollback.ino);
6421 ceph_assert(in);
6422 dout(10) << " target is " << *in << dendl;
6423 ceph_assert(!in->is_projected()); // a live slave request holds the versionlock xlock.
6424
6425 auto &pi = in->project_inode();
6426 pi.inode.version = in->pre_dirty();
6427 mut->add_projected_inode(in);
6428
6429 // parent dir rctime
6430 CDir *parent = in->get_projected_parent_dn()->get_dir();
6431 fnode_t *pf = parent->project_fnode();
6432 mut->add_projected_fnode(parent);
6433 pf->version = parent->pre_dirty();
6434 if (pf->fragstat.mtime == pi.inode.ctime) {
6435 pf->fragstat.mtime = rollback.old_dir_mtime;
6436 if (pf->rstat.rctime == pi.inode.ctime)
6437 pf->rstat.rctime = rollback.old_dir_rctime;
6438 mut->add_updated_lock(&parent->get_inode()->filelock);
6439 mut->add_updated_lock(&parent->get_inode()->nestlock);
6440 }
6441
6442 // inode
6443 pi.inode.ctime = rollback.old_ctime;
6444 if (rollback.was_inc)
6445 pi.inode.nlink--;
6446 else
6447 pi.inode.nlink++;
6448
6449 map<client_t,MClientSnap::ref> splits;
6450 if (rollback.snapbl.length() && in->snaprealm) {
6451 bool hadrealm;
6452 auto p = rollback.snapbl.cbegin();
6453 decode(hadrealm, p);
6454 if (hadrealm) {
6455 if (!mds->is_resolve()) {
6456 sr_t *new_srnode = new sr_t();
6457 decode(*new_srnode, p);
6458 in->project_snaprealm(new_srnode);
6459 } else {
6460 decode(in->snaprealm->srnode, p);
6461 }
6462 } else {
6463 SnapRealm *realm = parent->get_inode()->find_snaprealm();
6464 if (!mds->is_resolve())
6465 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
6466 in->project_snaprealm(NULL);
6467 }
6468 }
6469
6470 // journal it
6471 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
6472 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
6473 mdlog->start_entry(le);
6474 le->commit.add_dir_context(parent);
6475 le->commit.add_dir(parent, true);
6476 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
6477
6478 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
6479 mdr, __func__);
6480 mdlog->flush();
6481 }
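// A minimal sketch (not part of the original source) of the rollback
// record idea above: handle_slave_link_prep() snapshots exactly the fields
// it may need to restore, and do_link_rollback() decodes that snapshot and
// applies the inverse update. The structs stand in for link_rollback and
// the journaled inode; the encode/decode plumbing is omitted.
#if 0
#include <cstdint>
#include <ctime>

struct LinkRollbackRec {      // captured at prepare time
  uint64_t ino;
  time_t   old_ctime;         // inode ctime before the prepare
  time_t   old_dir_mtime;     // parent fragstat mtime before the prepare
  bool     was_inc;           // did the prepare do nlink++ (vs nlink--)?
};

struct MiniInode { time_t ctime; int nlink; };

static void apply_rollback(MiniInode& in, const LinkRollbackRec& rb)
{
  in.ctime = rb.old_ctime;            // restore the snapshot
  in.nlink += rb.was_inc ? -1 : 1;    // invert the prepared change
}
#endif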
6482
6483 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
6484 map<client_t,MClientSnap::ref>& splits)
6485 {
6486 dout(10) << "_link_rollback_finish" << dendl;
6487
6488 ceph_assert(g_conf()->mds_kill_link_at != 10);
6489
6490 mut->apply();
6491
6492 if (!mds->is_resolve())
6493 mdcache->send_snaps(splits);
6494
6495 if (mdr)
6496 mdcache->request_finish(mdr);
6497
6498 mdcache->finish_rollback(mut->reqid);
6499
6500 mut->cleanup();
6501 }
6502
6503
6504 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &m)
6505 {
6506 dout(10) << "handle_slave_link_prep_ack " << *mdr
6507 << " " << *m << dendl;
6508 mds_rank_t from = mds_rank_t(m->get_source().num());
6509
6510 ceph_assert(g_conf()->mds_kill_link_at != 11);
6511
6512 // note slave
6513 mdr->more()->slaves.insert(from);
6514
6515 // witnessed!
6516 ceph_assert(mdr->more()->witnessed.count(from) == 0);
6517 mdr->more()->witnessed.insert(from);
6518 ceph_assert(!m->is_not_journaled());
6519 mdr->more()->has_journaled_slaves = true;
6520
6521 // remove from waiting list
6522 ceph_assert(mdr->more()->waiting_on_slave.count(from));
6523 mdr->more()->waiting_on_slave.erase(from);
6524
6525 ceph_assert(mdr->more()->waiting_on_slave.empty());
6526
6527 dispatch_client_request(mdr); // go again!
6528 }
6529
6530
6531
6532
6533
6534 // UNLINK
6535
6536 void Server::handle_client_unlink(MDRequestRef& mdr)
6537 {
6538 const MClientRequest::const_ref &req = mdr->client_request;
6539 client_t client = mdr->get_client();
6540
6541 // rmdir or unlink?
6542 bool rmdir = false;
6543 if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
6544
6545 const filepath& refpath = req->get_filepath();
6546 if (refpath.depth() == 0) {
6547 respond_to_request(mdr, -EINVAL);
6548 return;
6549 }
6550 if (refpath.is_last_dot_or_dotdot()) {
6551 respond_to_request(mdr, -ENOTEMPTY);
6552 return;
6553 }
6554
6555 // traverse to path
6556 vector<CDentry*> trace;
6557 CInode *in;
6558 CF_MDS_MDRContextFactory cf(mdcache, mdr);
6559 int r = mdcache->path_traverse(mdr, cf, refpath, &trace, &in, MDS_TRAVERSE_FORWARD);
6560 if (r > 0) return;
6561 if (r < 0) {
6562 if (r == -ESTALE) {
6563 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
6564 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
6565 return;
6566 }
6567 respond_to_request(mdr, r);
6568 return;
6569 }
6570 if (mdr->snapid != CEPH_NOSNAP) {
6571 respond_to_request(mdr, -EROFS);
6572 return;
6573 }
6574
6575 CDentry *dn = trace.back();
6576 ceph_assert(dn);
6577 if (!dn->is_auth()) {
6578 mdcache->request_forward(mdr, dn->authority().first);
6579 return;
6580 }
6581
6582 CInode *diri = dn->get_dir()->get_inode();
6583
6584 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
6585 ceph_assert(!dnl->is_null());
6586
6587 if (rmdir) {
6588 dout(7) << "handle_client_rmdir on " << *dn << dendl;
6589 } else {
6590 dout(7) << "handle_client_unlink on " << *dn << dendl;
6591 }
6592 dout(7) << "dn links to " << *in << dendl;
6593
6594 // rmdir vs is_dir
6595 if (in->is_dir()) {
6596 if (rmdir) {
6597 // do empty directory checks
6598 if (_dir_is_nonempty_unlocked(mdr, in)) {
6599 respond_to_request(mdr, -ENOTEMPTY);
6600 return;
6601 }
6602 } else {
6603 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
6604 respond_to_request(mdr, -EISDIR);
6605 return;
6606 }
6607 } else {
6608 if (rmdir) {
6609 // unlink
6610 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
6611 respond_to_request(mdr, -ENOTDIR);
6612 return;
6613 }
6614 }
6615
6616 // -- create stray dentry? --
6617 CDentry *straydn = NULL;
6618 if (dnl->is_primary()) {
6619 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
6620 if (!straydn)
6621 return;
6622 dout(10) << " straydn is " << *straydn << dendl;
6623 } else if (mdr->straydn) {
6624 mdr->unpin(mdr->straydn);
6625 mdr->straydn = NULL;
6626 }
6627
6628 // lock
6629 MutationImpl::LockOpVec lov;
6630
6631 for (int i=0; i<(int)trace.size()-1; i++)
6632 lov.add_rdlock(&trace[i]->lock);
6633 lov.add_xlock(&dn->lock);
6634 lov.add_wrlock(&diri->filelock);
6635 lov.add_wrlock(&diri->nestlock);
6636 lov.add_xlock(&in->linklock);
6637 if (straydn) {
6638 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
6639 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
6640 lov.add_xlock(&straydn->lock);
6641 }
6642
6643 mds->locker->include_snap_rdlocks(diri, lov);
6644 lov.add_xlock(&in->snaplock);
6645 if (in->is_dir())
6646 lov.add_rdlock(&in->filelock); // to verify it's empty
6647
6648 if (!mds->locker->acquire_locks(mdr, lov))
6649 return;
6650
6651 if (in->is_dir() &&
6652 _dir_is_nonempty(mdr, in)) {
6653 respond_to_request(mdr, -ENOTEMPTY);
6654 return;
6655 }
6656
6657 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6658 if (!check_access(mdr, diri, MAY_WRITE))
6659 return;
6660 }
6661
6662 if (straydn)
6663 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
6664
6665 if (!mdr->more()->desti_srnode) {
6666 if (in->is_projected_snaprealm_global()) {
6667 sr_t *new_srnode = in->prepare_new_srnode(0);
6668 in->record_snaprealm_parent_dentry(new_srnode, NULL, dn, dnl->is_primary());
6669 // dropping the last linkage or the last remote linkage
6670 // detaches the inode from the global snaprealm
6671 auto nlink = in->get_projected_inode()->nlink;
6672 if (nlink == 1 ||
6673 (nlink == 2 && !dnl->is_primary() &&
6674 !in->get_projected_parent_dir()->inode->is_stray()))
6675 in->clear_snaprealm_global(new_srnode);
6676 mdr->more()->desti_srnode = new_srnode;
6677 } else if (dnl->is_primary()) {
6678 // prepare snaprealm blob for slave request
6679 SnapRealm *realm = in->find_snaprealm();
6680 snapid_t follows = realm->get_newest_seq();
6681 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
6682 sr_t *new_srnode = in->prepare_new_srnode(follows);
6683 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
6684 mdr->more()->desti_srnode = new_srnode;
6685 }
6686 }
6687 }
6688
6689 // yay!
6690 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
6691 // subtree root auths need to be witnesses
6692 set<mds_rank_t> witnesses;
6693 in->list_replicas(witnesses);
6694 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6695
6696 for (set<mds_rank_t>::iterator p = witnesses.begin();
6697 p != witnesses.end();
6698 ++p) {
6699 if (mdr->more()->witnessed.count(*p)) {
6700 dout(10) << " already witnessed by mds." << *p << dendl;
6701 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6702 dout(10) << " already waiting on witness mds." << *p << dendl;
6703 } else {
6704 if (!_rmdir_prepare_witness(mdr, *p, trace, straydn))
6705 return;
6706 }
6707 }
6708 if (!mdr->more()->waiting_on_slave.empty())
6709 return; // we're waiting for a witness.
6710 }
6711
6712 // ok!
6713 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
6714 _link_remote(mdr, false, dn, dnl->get_inode());
6715 else
6716 _unlink_local(mdr, dn, straydn);
6717 }
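// A compact sketch (not part of the original source) of the rmdir-vs-unlink
// decision table enforced near the top of handle_client_unlink(). The
// helper is hypothetical.
#if 0
#include <cerrno>

// Returns 0 if the operation may proceed, else a negative errno:
// unlink on a dir -> -EISDIR, rmdir on a non-dir -> -ENOTDIR,
// rmdir on a non-empty dir -> -ENOTEMPTY.
static int check_unlink_op(bool is_dir, bool rmdir, bool dir_nonempty)
{
  if (is_dir && !rmdir)
    return -EISDIR;
  if (!is_dir && rmdir)
    return -ENOTDIR;
  if (is_dir && dir_nonempty)
    return -ENOTEMPTY;
  return 0;
}
#endif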
6718
6719 class C_MDS_unlink_local_finish : public ServerLogContext {
6720 CDentry *dn;
6721 CDentry *straydn;
6722 version_t dnpv; // deleted dentry
6723 public:
6724 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
6725 ServerLogContext(s, r), dn(d), straydn(sd),
6726 dnpv(d->get_projected_version()) {}
6727 void finish(int r) override {
6728 ceph_assert(r == 0);
6729 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
6730 }
6731 };
6732
6733 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
6734 {
6735 dout(10) << "_unlink_local " << *dn << dendl;
6736
6737 CDentry::linkage_t *dnl = dn->get_projected_linkage();
6738 CInode *in = dnl->get_inode();
6739
6740
6741 // ok, let's do it.
6742 mdr->ls = mdlog->get_current_segment();
6743
6744 // prepare log entry
6745 EUpdate *le = new EUpdate(mdlog, "unlink_local");
6746 mdlog->start_entry(le);
6747 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6748 if (!mdr->more()->witnessed.empty()) {
6749 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6750 le->reqid = mdr->reqid;
6751 le->had_slaves = true;
6752 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6753 }
6754
6755 if (straydn) {
6756 ceph_assert(dnl->is_primary());
6757 straydn->push_projected_linkage(in);
6758 }
6759
6760 // the unlinked dentry
6761 dn->pre_dirty();
6762
6763 auto &pi = in->project_inode();
6764 {
6765 std::string t;
6766 dn->make_path_string(t, true);
6767 pi.inode.stray_prior_path = std::move(t);
6768 }
6769 pi.inode.version = in->pre_dirty();
6770 pi.inode.ctime = mdr->get_op_stamp();
6771 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6772 pi.inode.rstat.rctime = mdr->get_op_stamp();
6773 pi.inode.change_attr++;
6774 pi.inode.nlink--;
6775 if (pi.inode.nlink == 0)
6776 in->state_set(CInode::STATE_ORPHAN);
6777
6778 if (mdr->more()->desti_srnode) {
6779 auto& desti_srnode = mdr->more()->desti_srnode;
6780 in->project_snaprealm(desti_srnode);
6781 desti_srnode = NULL;
6782 }
6783
6784 if (straydn) {
6785 // will manually pop projected inode
6786
6787 // primary link. add stray dentry.
6788 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
6789 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6790
6791 pi.inode.update_backtrace();
6792 le->metablob.add_primary_dentry(straydn, in, true, true);
6793 } else {
6794 mdr->add_projected_inode(in);
6795 // remote link. update remote inode.
6796 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
6797 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
6798 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
6799 }
6800
6801 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6802 le->metablob.add_null_dentry(dn, true);
6803
6804 if (in->is_dir()) {
6805 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
6806 le->metablob.renamed_dirino = in->ino();
6807 }
6808
6809 dn->push_projected_linkage();
6810
6811 if (straydn) {
6812 ceph_assert(in->first <= straydn->first);
6813 in->first = straydn->first;
6814 }
6815
6816 if (in->is_dir()) {
6817 ceph_assert(straydn);
6818 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
6819 }
6820
6821 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
6822 }
6823
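// The projected-state pattern used throughout these update paths, compressed
// into a sketch (not compiled; see _unlink_local/_unlink_local_finish above
// and below for the real thing):
#if 0
auto &pi = in->project_inode();        // 1. project the post-unlink inode
pi.inode.version = in->pre_dirty();    //    and reserve its next version
dn->push_projected_linkage();          //    project the null dentry linkage
// ... describe the projected state in le->metablob ...
journal_and_reply(mdr, 0, dn, le,      // 2. journal; the callback fires only
    new C_MDS_unlink_local_finish(     //    after the EUpdate commits
        this, mdr, dn, straydn));
// 3. _unlink_local_finish() then pops: pop_projected_linkage() and
//    pop_and_dirty_projected_inode() turn the projection into live state.
#endif
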
6824 void Server::_unlink_local_finish(MDRequestRef& mdr,
6825 CDentry *dn, CDentry *straydn,
6826 version_t dnpv)
6827 {
6828 dout(10) << "_unlink_local_finish " << *dn << dendl;
6829
6830 if (!mdr->more()->witnessed.empty())
6831 mdcache->logged_master_update(mdr->reqid);
6832
6833 CInode *strayin = NULL;
6834 bool hadrealm = false;
6835 if (straydn) {
6836 // if there is a newly created snaprealm, we need to split the old
6837 // snaprealm's inodes_with_caps, so pop the snaprealm before the linkage changes.
6838 strayin = dn->get_linkage()->get_inode();
6839 hadrealm = strayin->snaprealm ? true : false;
6840 strayin->early_pop_projected_snaprealm();
6841 }
6842
6843 // unlink main dentry
6844 dn->get_dir()->unlink_inode(dn);
6845 dn->pop_projected_linkage();
6846
6847 // relink as stray? (i.e. was primary link?)
6848 if (straydn) {
6849 dout(20) << " straydn is " << *straydn << dendl;
6850 straydn->pop_projected_linkage();
6851
6852 strayin->pop_and_dirty_projected_inode(mdr->ls);
6853
6854 mdcache->touch_dentry_bottom(straydn);
6855 }
6856
6857 dn->mark_dirty(dnpv, mdr->ls);
6858 mdr->apply();
6859
6860 mdcache->send_dentry_unlink(dn, straydn, mdr);
6861
6862 if (straydn) {
6863 // update subtree map?
6864 if (strayin->is_dir())
6865 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
6866
6867 if (strayin->snaprealm && !hadrealm)
6868 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
6869 }
6870
6871 // bump pop
6872 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6873
6874 // reply
6875 respond_to_request(mdr, 0);
6876
6877 // removing a new dn?
6878 dn->get_dir()->try_remove_unlinked_dn(dn);
6879
6880 // clean up?
6881 // respond_to_request() drops locks. So stray reintegration can race with us.
6882 if (straydn && !straydn->get_projected_linkage()->is_null()) {
6883 // Tip off the MDCache that this dentry is a stray that
6884 // might be eligible for purge.
6885 mdcache->notify_stray(straydn);
6886 }
6887 }
6888
6889 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
6890 {
6891 if (mds->is_cluster_degraded() &&
6892 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
6893 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
6894 if (mdr->more()->waiting_on_slave.empty())
6895 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
6896 return false;
6897 }
6898
6899 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
6900 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
6901 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
6902 for (auto dn : trace)
6903 req->srcdnpath.push_dentry(dn->get_name());
6904 mdcache->replicate_stray(straydn, who, req->straybl);
6905 if (mdr->more()->desti_srnode)
6906 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
6907
6908 req->op_stamp = mdr->get_op_stamp();
6909 mds->send_message_mds(req, who);
6910
6911 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
6912 mdr->more()->waiting_on_slave.insert(who);
6913 return true;
6914 }
6915
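// Path encoding sketch: the request carries the dentry trace as a filepath
// anchored at the ino of the directory holding the first dentry, e.g. a
// trace for a/b/c under dir ino X becomes filepath(X)/"a"/"b"/"c". The
// witness walks it back into dentries in handle_slave_rmdir_prep() via
// path_traverse(). The stray dentry travels separately, pre-replicated in
// req->straybl by replicate_stray().
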
6916 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
6917 CDentry *dn, *straydn;
6918 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
6919 : ServerLogContext(s, r), dn(d), straydn(st) {}
6920 void finish(int r) override {
6921 server->_logged_slave_rmdir(mdr, dn, straydn);
6922 }
6923 };
6924
6925 struct C_MDS_SlaveRmdirCommit : public ServerContext {
6926 MDRequestRef mdr;
6927 CDentry *straydn;
6928 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
6929 : ServerContext(s), mdr(r), straydn(sd) { }
6930 void finish(int r) override {
6931 server->_commit_slave_rmdir(mdr, r, straydn);
6932 }
6933 };
6934
6935 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
6936 {
6937 dout(10) << "handle_slave_rmdir_prep " << *mdr
6938 << " " << mdr->slave_request->srcdnpath
6939 << " to " << mdr->slave_request->destdnpath
6940 << dendl;
6941
6942 vector<CDentry*> trace;
6943 filepath srcpath(mdr->slave_request->srcdnpath);
6944 dout(10) << " src " << srcpath << dendl;
6945 CInode *in;
6946 CF_MDS_MDRContextFactory cf(mdcache, mdr);
6947 int r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &in, MDS_TRAVERSE_DISCOVERXLOCK);
6948 if (r > 0) return;
6949 if (r == -ESTALE) {
6950 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
6951 mdr->slave_to_mds);
6952 return;
6953 }
6954 ceph_assert(r == 0);
6955 CDentry *dn = trace.back();
6956 dout(10) << " dn " << *dn << dendl;
6957 mdr->pin(dn);
6958
6959 ceph_assert(mdr->straydn);
6960 CDentry *straydn = mdr->straydn;
6961 dout(10) << " straydn " << *straydn << dendl;
6962
6963 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6964
6965 rmdir_rollback rollback;
6966 rollback.reqid = mdr->reqid;
6967 rollback.src_dir = dn->get_dir()->dirfrag();
6968 rollback.src_dname = dn->get_name();
6969 rollback.dest_dir = straydn->get_dir()->dirfrag();
6970 rollback.dest_dname = straydn->get_name();
6971 if (mdr->slave_request->desti_snapbl.length()) {
6972 if (in->snaprealm) {
6973 encode(true, rollback.snapbl);
6974 in->encode_snap_blob(rollback.snapbl);
6975 } else {
6976 encode(false, rollback.snapbl);
6977 }
6978 }
6979 encode(rollback, mdr->more()->rollback_bl);
6980 // FIXME: rollback snaprealm
6981 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
6982
6983 // set up commit waiter
6984 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
6985
6986 straydn->push_projected_linkage(in);
6987 dn->push_projected_linkage();
6988
6989 ceph_assert(straydn->first >= in->first);
6990 in->first = straydn->first;
6991
6992 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
6993 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
6994 _logged_slave_rmdir(mdr, dn, straydn);
6995 return;
6996 }
6997
6998 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
6999 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
7000 mdlog->start_entry(le);
7001 le->rollback = mdr->more()->rollback_bl;
7002
7003 le->commit.add_dir_context(straydn->get_dir());
7004 le->commit.add_primary_dentry(straydn, in, true);
7005 // slave: no need to journal original dentry
7006
7007 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7008 le->commit.renamed_dirino = in->ino();
7009
7010 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7011
7012 mdr->more()->slave_update_journaled = true;
7013 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
7014 mdr, __func__);
7015 mdlog->flush();
7016 }
7017
7018 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7019 {
7020 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
7021 CInode *in = dn->get_linkage()->get_inode();
7022
7023 bool new_realm;
7024 if (mdr->slave_request->desti_snapbl.length()) {
7025 new_realm = !in->snaprealm;
7026 in->decode_snap_blob(mdr->slave_request->desti_snapbl);
7027 ceph_assert(in->snaprealm);
7028 ceph_assert(in->snaprealm->have_past_parents_open());
7029 } else {
7030 new_realm = false;
7031 }
7032
7033 // update our cache now, so we are consistent with what is in the journal
7034 // when we journal a subtree map
7035 dn->get_dir()->unlink_inode(dn);
7036 straydn->pop_projected_linkage();
7037 dn->pop_projected_linkage();
7038
7039 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
7040
7041 if (new_realm)
7042 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7043
7044 // done.
7045 mdr->reset_slave_request();
7046 mdr->straydn = 0;
7047
7048 if (!mdr->aborted) {
7049 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
7050 if (!mdr->more()->slave_update_journaled)
7051 reply->mark_not_journaled();
7052 mds->send_message_mds(reply, mdr->slave_to_mds);
7053 } else {
7054 dout(10) << " abort flag set, finishing" << dendl;
7055 mdcache->request_finish(mdr);
7056 }
7057 }
7058
7059 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
7060 {
7061 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7062 << " " << *ack << dendl;
7063
7064 mds_rank_t from = mds_rank_t(ack->get_source().num());
7065
7066 mdr->more()->slaves.insert(from);
7067 mdr->more()->witnessed.insert(from);
7068 if (!ack->is_not_journaled())
7069 mdr->more()->has_journaled_slaves = true;
7070
7071 // remove from waiting list
7072 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7073 mdr->more()->waiting_on_slave.erase(from);
7074
7075 if (mdr->more()->waiting_on_slave.empty())
7076 dispatch_client_request(mdr); // go again!
7077 else
7078 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
7079 }
7080
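// Bookkeeping sketch for the ack above: each witness rank lives in exactly
// one of waiting_on_slave (prep sent, not yet acked) or witnessed (acked).
// Once waiting_on_slave drains, dispatch_client_request() re-runs the rmdir,
// which now falls straight through the witness loop and journals the master
// EUpdate. has_journaled_slaves records whether any witness journaled a
// PREPARE event.
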
7081 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7082 {
7083 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
7084
7085 if (r == 0) {
7086 if (mdr->more()->slave_update_journaled) {
7087 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7088 if (strayin && !strayin->snaprealm)
7089 mdcache->clear_dirty_bits_for_stray(strayin);
7090 }
7091
7092 mdr->cleanup();
7093
7094 if (mdr->more()->slave_update_journaled) {
7095 // write a commit to the journal
7096 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
7097 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7098 ESlaveUpdate::RMDIR);
7099 mdlog->start_entry(le);
7100 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7101 mdlog->flush();
7102 } else {
7103 _committed_slave(mdr);
7104 }
7105 } else {
7106 // abort
7107 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
7108 }
7109 }
7110
7111 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7112 metareqid_t reqid;
7113 CDentry *dn;
7114 CDentry *straydn;
7115 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7116 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7117 void finish(int r) override {
7118 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7119 }
7120 };
7121
7122 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
7123 {
7124 // unlike the other rollback methods, the rmdir rollback is only
7125 // needed to record the subtree changes in the journal for inode
7126 // replicas who are auth for empty dirfrags. no actual changes to
7127 // the file system are taking place here, so there is no Mutation.
7128
7129 rmdir_rollback rollback;
7130 auto p = rbl.cbegin();
7131 decode(rollback, p);
7132
7133 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7134 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
7135 ceph_assert(mdr || mds->is_resolve());
7136
7137 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7138 if (!dir)
7139 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7140 ceph_assert(dir);
7141 CDentry *dn = dir->lookup(rollback.src_dname);
7142 ceph_assert(dn);
7143 dout(10) << " dn " << *dn << dendl;
7144 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7145 ceph_assert(straydir);
7146 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7147 ceph_assert(straydn);
7148 dout(10) << " straydn " << *straydn << dendl;
7149 CInode *in = straydn->get_linkage()->get_inode();
7150
7151 dn->push_projected_linkage(in);
7152 straydn->push_projected_linkage();
7153
7154 if (rollback.snapbl.length() && in->snaprealm) {
7155 bool hadrealm;
7156 auto p = rollback.snapbl.cbegin();
7157 decode(hadrealm, p);
7158 if (hadrealm) {
7159 decode(in->snaprealm->srnode, p);
7160 } else {
7161 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7162 }
7163 }
7164
7165 if (mdr && !mdr->more()->slave_update_journaled) {
7166 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7167
7168 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7169 return;
7170 }
7171
7172
7173 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
7174 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
7175 mdlog->start_entry(le);
7176
7177 le->commit.add_dir_context(dn->get_dir());
7178 le->commit.add_primary_dentry(dn, in, true);
7179 // slave: no need to journal straydn
7180
7181 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7182 le->commit.renamed_dirino = in->ino();
7183
7184 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7185
7186 submit_mdlog_entry(le,
7187 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7188 dn, straydn),
7189 mdr, __func__);
7190 mdlog->flush();
7191 }
7192
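// The rollback blob is a straight encode/decode round trip of rmdir_rollback
// (a sketch; the encode side lives in handle_slave_rmdir_prep() above):
#if 0
rmdir_rollback rollback;                      // prepare time, on the witness
rollback.reqid      = mdr->reqid;
rollback.src_dir    = dn->get_dir()->dirfrag();
rollback.src_dname  = dn->get_name();
rollback.dest_dir   = straydn->get_dir()->dirfrag();
rollback.dest_dname = straydn->get_name();
encode(rollback, mdr->more()->rollback_bl);
// ... master aborts; the blob comes back here as 'rbl' ...
auto p = rbl.cbegin();
decode(rollback, p);                          // replayed by do_rmdir_rollback()
#endif
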
7193 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7194 {
7195 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7196
7197 straydn->get_dir()->unlink_inode(straydn);
7198 dn->pop_projected_linkage();
7199 straydn->pop_projected_linkage();
7200
7201 CInode *in = dn->get_linkage()->get_inode();
7202 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7203 !mdr || mdr->more()->slave_update_journaled);
7204
7205 if (mds->is_resolve()) {
7206 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7207 mdcache->try_trim_non_auth_subtree(root);
7208 }
7209
7210 if (mdr)
7211 mdcache->request_finish(mdr);
7212
7213 mdcache->finish_rollback(reqid);
7214 }
7215
7216
7217 /** _dir_is_nonempty[_unlocked]
7218 *
7219 * check if a directory is non-empty (in which case rmdir must fail).
7220 *
7221 * the unlocked variant is a fastpath check; we can't really be
7222 * sure until we rdlock the filelock.
7223 */
7224 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7225 {
7226 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7227 ceph_assert(in->is_auth());
7228
7229 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7230 return true; // in a snapshot!
7231
7232 list<CDir*> ls;
7233 in->get_dirfrags(ls);
7234 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7235 CDir *dir = *p;
7236 // is the frag obviously non-empty?
7237 if (dir->is_auth()) {
7238 if (dir->get_projected_fnode()->fragstat.size()) {
7239 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7240 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7241 return true;
7242 }
7243 }
7244 }
7245
7246 return false;
7247 }
7248
7249 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7250 {
7251 dout(10) << "dir_is_nonempty " << *in << dendl;
7252 ceph_assert(in->is_auth());
7253 ceph_assert(in->filelock.can_read(mdr->get_client()));
7254
7255 frag_info_t dirstat;
7256 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7257
7258 list<CDir*> ls;
7259 in->get_dirfrags(ls);
7260 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
7261 CDir *dir = *p;
7262 const fnode_t *pf = dir->get_projected_fnode();
7263 if (pf->fragstat.size()) {
7264 dout(10) << "dir_is_nonempty dirstat has "
7265 << pf->fragstat.size() << " items " << *dir << dendl;
7266 return true;
7267 }
7268
7269 if (pf->accounted_fragstat.version == dirstat_version)
7270 dirstat.add(pf->accounted_fragstat);
7271 else
7272 dirstat.add(pf->fragstat);
7273 }
7274
7275 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7276 }
7277
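// Worked example for the accounting check above (illustrative): with all
// live fragstats empty, the loop sums each frag's contribution to the
// inode's dirstat (accounted_fragstat when its version matches, else the
// live fragstat). If the sum disagrees with the inode's own dirstat.size(),
// some create/unlink has not been reconciled yet, so we conservatively
// report the directory as non-empty rather than racing the update.
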
7278
7279 // ======================================================
7280
7281
7282 class C_MDS_rename_finish : public ServerLogContext {
7283 CDentry *srcdn;
7284 CDentry *destdn;
7285 CDentry *straydn;
7286 public:
7287 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7288 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7289 ServerLogContext(s, r),
7290 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7291 void finish(int r) override {
7292 ceph_assert(r == 0);
7293 server->_rename_finish(mdr, srcdn, destdn, straydn);
7294 }
7295 };
7296
7297
7298 /** handle_client_rename
7299 *
7300 * rename master is the destdn auth. this is because cached inodes
7301 * must remain connected. thus, any replica of srci must also
7302 * replicate destdn, and possibly straydn, so that srci (and
7303 * destdn->inode) remain connected during the rename.
7304 *
7305 * to do this, we freeze srci, then master (destdn auth) verifies that
7306 * all other nodes have also replicated destdn and straydn. note that
7307 * destdn replicas need not also replicate srci. this only works when
7308 * destdn is master.
7309 *
7310 * This function takes responsibility for the passed mdr.
7311 */
7312 void Server::handle_client_rename(MDRequestRef& mdr)
7313 {
7314 const MClientRequest::const_ref &req = mdr->client_request;
7315 dout(7) << "handle_client_rename " << *req << dendl;
7316
7317 filepath destpath = req->get_filepath();
7318 filepath srcpath = req->get_filepath2();
7319 if (destpath.depth() == 0 || srcpath.depth() == 0) {
7320 respond_to_request(mdr, -EINVAL);
7321 return;
7322 }
7323 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
7324 respond_to_request(mdr, -EBUSY);
7325 return;
7326 }
7327
7328 std::string_view destname = destpath.last_dentry();
7329
7330 vector<CDentry*>& srctrace = mdr->dn[1];
7331 vector<CDentry*>& desttrace = mdr->dn[0];
7332
7333 MutationImpl::LockOpVec lov;
7334
7335 CDentry *destdn = rdlock_path_xlock_dentry(mdr, 0, lov, true, false, true);
7336 if (!destdn) return;
7337 dout(10) << " destdn " << *destdn << dendl;
7338 if (mdr->snapid != CEPH_NOSNAP) {
7339 respond_to_request(mdr, -EROFS);
7340 return;
7341 }
7342 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7343 CDir *destdir = destdn->get_dir();
7344 ceph_assert(destdir->is_auth());
7345
7346 CF_MDS_MDRContextFactory cf(mdcache, mdr);
7347 int r = mdcache->path_traverse(mdr, cf, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER);
7348 if (r > 0)
7349 return; // delayed
7350 if (r < 0) {
7351 if (r == -ESTALE) {
7352 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
7353 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
7354 } else {
7355 dout(10) << "FAIL on error " << r << dendl;
7356 respond_to_request(mdr, r);
7357 }
7358 return;
7359
7360 }
7361 ceph_assert(!srctrace.empty());
7362 CDentry *srcdn = srctrace.back();
7363 dout(10) << " srcdn " << *srcdn << dendl;
7364 if (srcdn->last != CEPH_NOSNAP) {
7365 respond_to_request(mdr, -EROFS);
7366 return;
7367 }
7368 CDir *srcdir = srcdn->get_dir();
7369 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7370 CInode *srci = srcdnl->get_inode();
7371 dout(10) << " srci " << *srci << dendl;
7372
7373 CInode *oldin = 0;
7374 if (!destdnl->is_null()) {
7375 //dout(10) << "dest dn exists " << *destdn << dendl;
7376 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
7377 if (!oldin) return;
7378 dout(10) << " oldin " << *oldin << dendl;
7379
7380 // non-empty dir? do a trivial fast unlocked check; we do another check later with read locks
7381 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
7382 respond_to_request(mdr, -ENOTEMPTY);
7383 return;
7384 }
7385
7386 // if srcdn is replica, need to make sure its linkage is correct
7387 if (srcdn->is_auth() ||
7388 srcdn->lock.can_read(mdr->get_client()) ||
7389 (srcdn->lock.is_xlocked() && srcdn->lock.get_xlock_by() == mdr)) {
7390 // mv /some/thing /to/some/existing_other_thing
7391 if (oldin->is_dir() && !srci->is_dir()) {
7392 respond_to_request(mdr, -EISDIR);
7393 return;
7394 }
7395 if (!oldin->is_dir() && srci->is_dir()) {
7396 respond_to_request(mdr, -ENOTDIR);
7397 return;
7398 }
7399 if (srci == oldin && !srcdir->inode->is_stray()) {
7400 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
7401 return;
7402 }
7403 }
7404 }
7405
7406 // -- some sanity checks --
7407
7408 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7409 if (destpath.get_ino() != srcpath.get_ino() &&
7410 !(req->get_source().is_mds() &&
7411 MDS_INO_IS_MDSDIR(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7412 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
7413 CInode *destbase = desttrace[0]->get_dir()->get_inode();
7414 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7415 while (srcbase != destbase &&
7416 !srcbase->is_projected_ancestor_of(destbase)) {
7417 CDentry *pdn = srcbase->get_projected_parent_dn();
7418 srctrace.insert(srctrace.begin(), pdn);
7419 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
7420 srcbase = pdn->get_dir()->get_inode();
7421 }
7422
7423 // then, extend destpath until it shares the same parent inode as srcpath.
7424 while (destbase != srcbase) {
7425 CDentry *pdn = destbase->get_projected_parent_dn();
7426 desttrace.insert(desttrace.begin(), pdn);
7427 lov.add_rdlock(&pdn->lock);
7428 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
7429 destbase = pdn->get_dir()->get_inode();
7430 }
7431 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
7432 }
7433
7434 // src == dest?
7435 if (srcdir == destdir && srcdn->get_name() == destname) {
7436 dout(7) << "rename src=dest, noop" << dendl;
7437 respond_to_request(mdr, 0);
7438 return;
7439 }
7440
7441 // dest a child of src?
7442 // e.g. mv /usr /usr/foo
7443 CDentry *pdn = destdir->inode->get_projected_parent_dn();
7444 while (pdn) {
7445 if (pdn == srcdn) {
7446 dout(7) << "cannot rename item to be a child of itself" << dendl;
7447 respond_to_request(mdr, -EINVAL);
7448 return;
7449 }
7450 pdn = pdn->get_dir()->inode->parent;
7451 }
7452
7453 // is this a stray migration, reintegration or merge? (sanity checks!)
7454 if (mdr->reqid.name.is_mds() &&
7455 !(MDS_INO_IS_MDSDIR(srcpath.get_ino()) &&
7456 MDS_INO_IS_MDSDIR(destpath.get_ino())) &&
7457 !(destdnl->is_remote() &&
7458 destdnl->get_remote_ino() == srci->ino())) {
7459 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
7460 return;
7461 }
7462
7463 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7464 if (linkmerge)
7465 dout(10) << " this is a link merge" << dendl;
7466
7467 // -- create stray dentry? --
7468 CDentry *straydn = NULL;
7469 if (destdnl->is_primary() && !linkmerge) {
7470 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
7471 if (!straydn)
7472 return;
7473 dout(10) << " straydn is " << *straydn << dendl;
7474 } else if (mdr->straydn) {
7475 mdr->unpin(mdr->straydn);
7476 mdr->straydn = NULL;
7477 }
7478
7479 // -- prepare witness list --
7480 /*
7481 * NOTE: we use _all_ replicas as witnesses.
7482 * this probably isn't totally necessary (esp for file renames),
7483 * but if/when we change that, we have to make sure rejoin is
7484 * sufficiently robust to handle strong rejoins from survivors
7485 * with totally wrong dentry->inode linkage.
7486 * (currently, it can ignore rename effects, because the resolve
7487 * stage will sort them out.)
7488 */
7489 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
7490 if (srcdn->is_auth())
7491 srcdn->list_replicas(witnesses);
7492 else
7493 witnesses.insert(srcdn->authority().first);
7494 if (srcdnl->is_remote() && !srci->is_auth())
7495 witnesses.insert(srci->authority().first);
7496 destdn->list_replicas(witnesses);
7497 if (destdnl->is_remote() && !oldin->is_auth())
7498 witnesses.insert(oldin->authority().first);
7499 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7500
7501
7502 // -- locks --
7503
7504 // srctrace items. this mirrors locks taken in rdlock_path_xlock_dentry
7505 for (int i=0; i<(int)srctrace.size(); i++)
7506 lov.add_rdlock(&srctrace[i]->lock);
7507 lov.add_xlock(&srcdn->lock);
7508 mds_rank_t srcdirauth = srcdir->authority().first;
7509 if (srcdirauth != mds->get_nodeid()) {
7510 dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
7511 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdirauth);
7512 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdirauth);
7513 if (srci->is_dir())
7514 lov.add_rdlock(&srci->dirfragtreelock);
7515 } else {
7516 lov.add_wrlock(&srcdir->inode->filelock);
7517 lov.add_wrlock(&srcdir->inode->nestlock);
7518 }
7519 mds->locker->include_snap_rdlocks(srcdir->inode, lov);
7520
7521 // straydn?
7522 if (straydn) {
7523 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7524 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7525 lov.add_xlock(&straydn->lock);
7526 }
7527
7528 // xlock versionlock on dentries if there are witnesses.
7529 // replicas can't see projected dentry linkages, and will get
7530 // confused if we try to pipeline things.
7531 if (!witnesses.empty()) {
7532 // take xlock on all projected ancestor dentries for srcdn and destdn.
7533 // this ensures the srcdn and destdn can be traversed to by the witnesses.
7534 for (int i=0; i<(int)srctrace.size(); i++) {
7535 if (srctrace[i]->is_auth() && srctrace[i]->is_projected())
7536 lov.add_xlock(&srctrace[i]->versionlock);
7537 }
7538 for (int i=0; i<(int)desttrace.size(); i++) {
7539 if (desttrace[i]->is_auth() && desttrace[i]->is_projected())
7540 lov.add_xlock(&desttrace[i]->versionlock);
7541 }
7542 // xlock srci and oldin's primary dentries, so witnesses can call
7543 // open_remote_ino() with 'want_locked=true' when the srcdn or destdn
7544 // is traversed.
7545 if (srcdnl->is_remote())
7546 lov.add_xlock(&srci->get_projected_parent_dn()->lock);
7547 if (destdnl->is_remote())
7548 lov.add_xlock(&oldin->get_projected_parent_dn()->lock);
7549 }
7550
7551 // we need to update srci's ctime. xlock its least contended lock to do that...
7552 lov.add_xlock(&srci->linklock);
7553 lov.add_xlock(&srci->snaplock);
7554
7555 if (oldin) {
7556 // xlock oldin (for nlink--)
7557 lov.add_xlock(&oldin->linklock);
7558 lov.add_xlock(&oldin->snaplock);
7559 if (oldin->is_dir())
7560 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7561 }
7562
7563 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
7564 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
7565 return;
7566
7567 if (linkmerge)
7568 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
7569
7570 if (!mdr->has_more() || mdr->more()->witnessed.empty()) {
7571 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7572 return;
7573
7574 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
7575 return;
7576
7577 if (!check_fragment_space(mdr, destdn->get_dir()))
7578 return;
7579
7580 if (!check_access(mdr, srci, MAY_WRITE))
7581 return;
7582 }
7583
7584 // with read lock, really verify oldin is empty
7585 if (oldin &&
7586 oldin->is_dir() &&
7587 _dir_is_nonempty(mdr, oldin)) {
7588 respond_to_request(mdr, -ENOTEMPTY);
7589 return;
7590 }
7591
7592 /* project_snaprealm_past_parent() will do this job
7593 *
7594 // moving between snaprealms?
7595 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7596 SnapRealm *srcrealm = srci->find_snaprealm();
7597 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7598 if (srcrealm != destrealm &&
7599 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7600 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7601 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7602 mdcache->snaprealm_create(mdr, srci);
7603 return;
7604 }
7605 }
7606 */
7607
7608 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7609
7610 // -- open all srcdn inode frags, if any --
7611 // we need these open so that auth can properly delegate from inode to dirfrags
7612 // after the inode is _ours_.
7613 if (srcdnl->is_primary() &&
7614 !srcdn->is_auth() &&
7615 srci->is_dir()) {
7616 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
7617 mdr->set_stickydirs(srci);
7618
7619 frag_vec_t leaves;
7620 srci->dirfragtree.get_leaves(leaves);
7621 for (const auto& leaf : leaves) {
7622 CDir *dir = srci->get_dirfrag(leaf);
7623 if (!dir) {
7624 dout(10) << " opening " << leaf << " under " << *srci << dendl;
7625 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7626 return;
7627 }
7628 }
7629 }
7630
7631 // -- prepare snaprealm ---
7632
7633 if (linkmerge) {
7634 if (!mdr->more()->srci_srnode &&
7635 srci->get_projected_inode()->nlink == 1 &&
7636 srci->is_projected_snaprealm_global()) {
7637 sr_t *new_srnode = srci->prepare_new_srnode(0);
7638 srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);
7639
7640 srci->clear_snaprealm_global(new_srnode);
7641 mdr->more()->srci_srnode = new_srnode;
7642 }
7643 } else {
7644 if (oldin && !mdr->more()->desti_srnode) {
7645 if (oldin->is_projected_snaprealm_global()) {
7646 sr_t *new_srnode = oldin->prepare_new_srnode(0);
7647 oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
7648 // dropping the last linkage or dropping the last remote linkage,
7649 // detach the inode from the global snaprealm
7650 auto nlink = oldin->get_projected_inode()->nlink;
7651 if (nlink == 1 ||
7652 (nlink == 2 && !destdnl->is_primary() &&
7653 !oldin->get_projected_parent_dir()->inode->is_stray()))
7654 oldin->clear_snaprealm_global(new_srnode);
7655 mdr->more()->desti_srnode = new_srnode;
7656 } else if (destdnl->is_primary()) {
7657 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7658 snapid_t follows = dest_realm->get_newest_seq();
7659 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
7660 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
7661 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7662 mdr->more()->desti_srnode = new_srnode;
7663 }
7664 }
7665 }
7666 if (!mdr->more()->srci_srnode) {
7667 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7668 if (srci->is_projected_snaprealm_global()) {
7669 sr_t *new_srnode = srci->prepare_new_srnode(0);
7670 srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
7671 mdr->more()->srci_srnode = new_srnode;
7672 } else if (srcdnl->is_primary()) {
7673 SnapRealm *src_realm = srcdir->inode->find_snaprealm();
7674 snapid_t follows = src_realm->get_newest_seq();
7675 if (src_realm != dest_realm &&
7676 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
7677 sr_t *new_srnode = srci->prepare_new_srnode(follows);
7678 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
7679 mdr->more()->srci_srnode = new_srnode;
7680 }
7681 }
7682 }
7683 }
7684
7685 // -- prepare witnesses --
7686
7687 // do srcdn auth last
7688 mds_rank_t last = MDS_RANK_NONE;
7689 if (!srcdn->is_auth()) {
7690 last = srcdn->authority().first;
7691 mdr->more()->srcdn_auth_mds = last;
7692 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
7693 // are involved in the rename operation.
7694 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
7695 dout(10) << " preparing ambiguous auth for srci" << dendl;
7696 ceph_assert(mdr->more()->is_remote_frozen_authpin);
7697 ceph_assert(mdr->more()->rename_inode == srci);
7698 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7699 return;
7700 }
7701 }
7702
7703 for (set<mds_rank_t>::iterator p = witnesses.begin();
7704 p != witnesses.end();
7705 ++p) {
7706 if (*p == last) continue; // do it last!
7707 if (mdr->more()->witnessed.count(*p)) {
7708 dout(10) << " already witnessed by mds." << *p << dendl;
7709 } else if (mdr->more()->waiting_on_slave.count(*p)) {
7710 dout(10) << " already waiting on witness mds." << *p << dendl;
7711 } else {
7712 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
7713 return;
7714 }
7715 }
7716 if (!mdr->more()->waiting_on_slave.empty())
7717 return; // we're waiting for a witness.
7718
7719 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
7720 dout(10) << " preparing last witness (srcdn auth)" << dendl;
7721 ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
7722 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7723 return;
7724 }
7725
7726 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
7727 if (!mdr->more()->slaves.empty() && !srci->is_dir())
7728 ceph_assert(g_conf()->mds_kill_rename_at != 3);
7729 if (!mdr->more()->slaves.empty() && srci->is_dir())
7730 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7731
7732 // -- declare now --
7733 mdr->set_mds_stamp(ceph_clock_now());
7734
7735 // -- prepare journal entry --
7736 mdr->ls = mdlog->get_current_segment();
7737 EUpdate *le = new EUpdate(mdlog, "rename");
7738 mdlog->start_entry(le);
7739 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7740 if (!mdr->more()->witnessed.empty()) {
7741 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7742
7743 le->reqid = mdr->reqid;
7744 le->had_slaves = true;
7745
7746 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7747 // no need to send the frozen auth pin to the recovering auth MDS of srci
7748 mdr->more()->is_remote_frozen_authpin = false;
7749 }
7750
7751 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
7752 if (le->client_map.length())
7753 le->cmapv = mds->sessionmap.get_projected();
7754
7755 // -- commit locally --
7756 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
7757
7758 journal_and_reply(mdr, srci, destdn, le, fin);
7759 }
7760
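// Shape of the multi-MDS path just dispatched (sketch):
//   1. if srcdn is a remote primary, its auth freezes/authpins srci first
//      (auth_pin_freeze, is_remote_frozen_authpin above);
//   2. _rename_prepare_witness() sends OP_RENAMEPREP to each witness, with
//      srcdn's auth deliberately prepared last so it can verify that
//      req->witnesses is sufficient before anything is journaled;
//   3. when waiting_on_slave drains, _rename_prepare() fills the EUpdate
//      and journal_and_reply() commits it locally;
//   4. C_MDS_rename_finish -> _rename_finish() -> _rename_apply() pops the
//      projected state and replies to the client.
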
7761
7762 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7763 {
7764 dout(10) << "_rename_finish " << *mdr << dendl;
7765
7766 if (!mdr->more()->witnessed.empty())
7767 mdcache->logged_master_update(mdr->reqid);
7768
7769 // apply
7770 _rename_apply(mdr, srcdn, destdn, straydn);
7771
7772 mdcache->send_dentry_link(destdn, mdr);
7773
7774 CDentry::linkage_t *destdnl = destdn->get_linkage();
7775 CInode *in = destdnl->get_inode();
7776 bool need_eval = mdr->more()->cap_imports.count(in);
7777
7778 // test hack: test slave commit
7779 if (!mdr->more()->slaves.empty() && !in->is_dir())
7780 ceph_assert(g_conf()->mds_kill_rename_at != 5);
7781 if (!mdr->more()->slaves.empty() && in->is_dir())
7782 ceph_assert(g_conf()->mds_kill_rename_at != 6);
7783
7784 // bump popularity
7785 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7786 if (destdnl->is_remote() && in->is_auth())
7787 mds->balancer->hit_inode(in, META_POP_IWR);
7788
7789 // did we import srci? if so, explicitly ack that import before we unlock and reply.
7790
7791 ceph_assert(g_conf()->mds_kill_rename_at != 7);
7792
7793 // reply
7794 respond_to_request(mdr, 0);
7795
7796 if (need_eval)
7797 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
7798
7799 // clean up?
7800 // respond_to_request() drops locks. So stray reintegration can race with us.
7801 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7802 mdcache->notify_stray(straydn);
7803 }
7804 }
7805
7806
7807
7808 // helpers
7809
7810 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
7811 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
7812 {
7813 if (mds->is_cluster_degraded() &&
7814 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7815 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
7816 if (mdr->more()->waiting_on_slave.empty())
7817 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7818 return false;
7819 }
7820
7821 dout(10) << "_rename_prepare_witness mds." << who << dendl;
7822 auto req = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
7823
7824 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
7825 for (auto dn : srctrace)
7826 req->srcdnpath.push_dentry(dn->get_name());
7827 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
7828 for (auto dn : dsttrace)
7829 req->destdnpath.push_dentry(dn->get_name());
7830 if (straydn)
7831 mdcache->replicate_stray(straydn, who, req->straybl);
7832
7833 if (mdr->more()->srci_srnode)
7834 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
7835 if (mdr->more()->desti_srnode)
7836 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7837
7838 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7839
7840 // srcdn auth will verify our current witness list is sufficient
7841 req->witnesses = witnesses;
7842
7843 req->op_stamp = mdr->get_op_stamp();
7844 mds->send_message_mds(req, who);
7845
7846 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
7847 mdr->more()->waiting_on_slave.insert(who);
7848 return true;
7849 }
7850
7851 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
7852 {
7853 version_t oldpv = mdr->more()->inode_import_v;
7854
7855 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7856
7857 /* import node */
7858 auto blp = mdr->more()->inode_import.cbegin();
7859
7860 // imported caps
7861 map<client_t,entity_inst_t> client_map;
7862 map<client_t, client_metadata_t> client_metadata_map;
7863 decode(client_map, blp);
7864 decode(client_metadata_map, blp);
7865 prepare_force_open_sessions(client_map, client_metadata_map,
7866 mdr->more()->imported_session_map);
7867 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
7868 encode(client_metadata_map, *client_map_bl);
7869
7870 list<ScatterLock*> updated_scatterlocks;
7871 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
7872 mdr->more()->cap_imports, updated_scatterlocks);
7873
7874 // hack: force back to !auth and clean, temporarily
7875 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
7876 srcdnl->get_inode()->mark_clean();
7877
7878 return oldpv;
7879 }
7880
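// Wire layout of mdr->more()->inode_import, as consumed above (sketch; the
// exporter must encode in the same order):
//   1. map<client_t, entity_inst_t>      -- cap-holding clients
//   2. map<client_t, client_metadata_t>  -- their session metadata
//   3. the inode blob itself, decoded by Migrator::decode_import_inode(),
//      which also rebuilds caps (cap_imports) and scatterlock state.
// The re-encoded client maps are journaled in the EUpdate's client_map so
// replay can recreate the sessions (see le->cmapv in handle_client_rename).
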
7881 bool Server::_need_force_journal(CInode *diri, bool empty)
7882 {
7883 std::vector<CDir*> dirs;
7884 diri->get_dirfrags(dirs);
7885
7886 bool force_journal = false;
7887 if (empty) {
7888 for (const auto& dir : dirs) {
7889 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
7890 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7891 force_journal = true;
7892 break;
7893 } else
7894 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
7895 }
7896 } else {
7897 // see if any children of our frags are auth subtrees.
7898 std::vector<CDir*> subtrees;
7899 mdcache->get_subtrees(subtrees);
7900 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
7901 for (const auto& dir : dirs) {
7902 for (const auto& subtree : subtrees) {
7903 if (dir->contains(subtree)) {
7904 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
7905 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
7906 << *subtree << dendl;
7907 force_journal = true;
7908 break;
7909 } else
7910 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
7911 } else
7912 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
7913 }
7914 if (force_journal)
7915 break;
7916 }
7917 }
7918 return force_journal;
7919 }
7920
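// Example for the non-empty case above (illustrative): another rank renames
// directory /a, which we replicate, while /a/b is an auth subtree root on
// this rank. One of /a's dirfrags contains() the /a/b subtree bound and
// subtree->get_dir_auth().first == our rank, so the rename must be
// force-journaled even though we are not auth for the renamed dentry itself:
// replay on this rank needs the event to re-anchor the nested auth subtree.
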
7921 void Server::_rename_prepare(MDRequestRef& mdr,
7922 EMetaBlob *metablob, bufferlist *client_map_bl,
7923 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
7924 {
7925 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
7926 if (straydn)
7927 dout(10) << " straydn " << *straydn << dendl;
7928
7929 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7930 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7931 CInode *srci = srcdnl->get_inode();
7932 CInode *oldin = destdnl->get_inode();
7933
7934 // primary+remote link merge?
7935 bool linkmerge = (srci == oldin);
7936 if (linkmerge)
7937 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7938 bool silent = srcdn->get_dir()->inode->is_stray();
7939
7940 bool force_journal_dest = false;
7941 if (srci->is_dir() && !destdn->is_auth()) {
7942 if (srci->is_auth()) {
7943 // if we are auth for srci and exporting it, force journal because journal replay needs
7944 // the source inode to create auth subtrees.
7945 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
7946 force_journal_dest = true;
7947 } else
7948 force_journal_dest = _need_force_journal(srci, false);
7949 }
7950
7951 bool force_journal_stray = false;
7952 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
7953 force_journal_stray = _need_force_journal(oldin, true);
7954
7955 if (linkmerge)
7956 dout(10) << " merging remote and primary links to the same inode" << dendl;
7957 if (silent)
7958 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
7959 if (force_journal_dest)
7960 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
7961 if (force_journal_stray)
7962 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
7963
7964 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
7965 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
7966 metablob->renamed_dirino = srci->ino();
7967 } else if (oldin && oldin->is_dir() && force_journal_stray) {
7968 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
7969 metablob->renamed_dirino = oldin->ino();
7970 }
7971
7972 // prepare
7973 CInode::mempool_inode *spi = 0; // renamed inode
7974 CInode::mempool_inode *tpi = 0; // target/overwritten inode
7975
7976 // target inode
7977 if (!linkmerge) {
7978 if (destdnl->is_primary()) {
7979 ceph_assert(straydn); // moving to straydn.
7980 // link--, and move.
7981 if (destdn->is_auth()) {
7982 auto &pi= oldin->project_inode(); //project_snaprealm
7983 pi.inode.version = straydn->pre_dirty(pi.inode.version);
7984 pi.inode.update_backtrace();
7985 tpi = &pi.inode;
7986 }
7987 straydn->push_projected_linkage(oldin);
7988 } else if (destdnl->is_remote()) {
7989 // nlink-- targeti
7990 if (oldin->is_auth()) {
7991 auto &pi = oldin->project_inode();
7992 pi.inode.version = oldin->pre_dirty();
7993 tpi = &pi.inode;
7994 }
7995 }
7996 }
7997
7998 // dest
7999 if (srcdnl->is_remote()) {
8000 if (!linkmerge) {
8001 // destdn
8002 if (destdn->is_auth())
8003 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8004 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8005 // srci
8006 if (srci->is_auth()) {
8007 auto &pi = srci->project_inode();
8008 pi.inode.version = srci->pre_dirty();
8009 spi = &pi.inode;
8010 }
8011 } else {
8012 dout(10) << " will merge remote onto primary link" << dendl;
8013 if (destdn->is_auth()) {
8014 auto &pi = oldin->project_inode();
8015 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
8016 spi = &pi.inode;
8017 }
8018 }
8019 } else { // primary
8020 if (destdn->is_auth()) {
8021 version_t oldpv;
8022 if (srcdn->is_auth())
8023 oldpv = srci->get_projected_version();
8024 else {
8025 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8026
8027 // note which dirfrags have child subtrees in the journal
8028 // event, so that we can open those (as bounds) during replay.
8029 if (srci->is_dir()) {
8030 list<CDir*> ls;
8031 srci->get_dirfrags(ls);
8032 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8033 CDir *dir = *p;
8034 if (!dir->is_auth())
8035 metablob->renamed_dir_frags.push_back(dir->get_frag());
8036 }
8037 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8038 }
8039 }
8040 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
8041 // & srcdnl->snaprealm
8042 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8043 pi.inode.update_backtrace();
8044 spi = &pi.inode;
8045 }
8046 destdn->push_projected_linkage(srci);
8047 }
8048
8049 // src
8050 if (srcdn->is_auth())
8051 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8052 srcdn->push_projected_linkage(); // push null linkage
8053
8054 if (!silent) {
8055 if (spi) {
8056 spi->ctime = mdr->get_op_stamp();
8057 if (mdr->get_op_stamp() > spi->rstat.rctime)
8058 spi->rstat.rctime = mdr->get_op_stamp();
8059 spi->change_attr++;
8060 if (linkmerge)
8061 spi->nlink--;
8062 }
8063 if (tpi) {
8064 tpi->ctime = mdr->get_op_stamp();
8065 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8066 tpi->rstat.rctime = mdr->get_op_stamp();
8067 tpi->change_attr++;
8068 {
8069 std::string t;
8070 destdn->make_path_string(t, true);
8071 tpi->stray_prior_path = std::move(t);
8072 }
8073 tpi->nlink--;
8074 if (tpi->nlink == 0)
8075 oldin->state_set(CInode::STATE_ORPHAN);
8076 }
8077 }
8078
8079 // prepare nesting, mtime updates
8080 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8081
8082 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8083 // then link the source inode to destdn
8084 if (destdnl->is_primary()) {
8085 ceph_assert(straydn);
8086 if (straydn->is_auth()) {
8087 metablob->add_dir_context(straydn->get_dir());
8088 metablob->add_dir(straydn->get_dir(), true);
8089 }
8090 }
8091
8092 // sub off target
8093 if (destdn->is_auth() && !destdnl->is_null()) {
8094 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8095 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8096 if (destdnl->is_primary()) {
8097 ceph_assert(straydn);
8098 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8099 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8100 }
8101 }
8102
8103 // move srcdn
8104 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8105 int flags = predirty_dir | predirty_primary;
8106 if (srcdn->is_auth())
8107 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8108 if (destdn->is_auth())
8109 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8110
8111 // add it all to the metablob
8112 // target inode
8113 if (!linkmerge) {
8114 if (destdnl->is_primary()) {
8115 ceph_assert(straydn);
8116 if (destdn->is_auth()) {
8117 // project snaprealm, too
8118 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8119 oldin->project_snaprealm(desti_srnode);
8120 if (tpi->nlink == 0)
8121 ceph_assert(!desti_srnode->is_parent_global());
8122 desti_srnode = NULL;
8123 }
8124 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8125 metablob->add_primary_dentry(straydn, oldin, true, true);
8126 } else if (force_journal_stray) {
8127 dout(10) << " forced journaling straydn " << *straydn << dendl;
8128 metablob->add_dir_context(straydn->get_dir());
8129 metablob->add_primary_dentry(straydn, oldin, true);
8130 }
8131 } else if (destdnl->is_remote()) {
8132 if (oldin->is_auth()) {
8133 sr_t *new_srnode = NULL;
8134 if (mdr->slave_request) {
8135 if (mdr->slave_request->desti_snapbl.length() > 0) {
8136 new_srnode = new sr_t();
8137 auto p = mdr->slave_request->desti_snapbl.cbegin();
8138 decode(*new_srnode, p);
8139 }
8140 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8141 new_srnode = desti_srnode;
8142 desti_srnode = NULL;
8143 }
8144 if (new_srnode) {
8145 oldin->project_snaprealm(new_srnode);
8146 if (tpi->nlink == 0)
8147 ceph_assert(!new_srnode->is_parent_global());
8148 }
8149 // auth for targeti
8150 metablob->add_dir_context(oldin->get_projected_parent_dir());
8151 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
8152 CEPH_NOSNAP, 0, destdnl);
8153 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
8154 }
8155 }
8156 }
8157
8158 // dest
8159 if (srcdnl->is_remote()) {
8160 ceph_assert(!linkmerge);
8161 if (destdn->is_auth() && !destdnl->is_null())
8162 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8163 else
8164 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8165
8166 if (destdn->is_auth())
8167 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8168
8169 if (srci->is_auth()) { // it's remote
8170 if (mdr->slave_request) {
8171 if (mdr->slave_request->srci_snapbl.length() > 0) {
8172 sr_t *new_srnode = new sr_t();
8173 auto p = mdr->slave_request->srci_snapbl.cbegin();
8174 decode(*new_srnode, p);
8175 srci->project_snaprealm(new_srnode);
8176 }
8177 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8178 srci->project_snaprealm(srci_srnode);
8179 srci_srnode = NULL;
8180 }
8181
8182 CDentry *srci_pdn = srci->get_projected_parent_dn();
8183 metablob->add_dir_context(srci_pdn->get_dir());
8184 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
8185 metablob->add_primary_dentry(srci_pdn, srci, true);
8186 }
8187 } else if (srcdnl->is_primary()) {
8188 // project snap parent update?
8189 if (destdn->is_auth()) {
8190 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8191 srci->project_snaprealm(srci_srnode);
8192 srci_srnode = NULL;
8193 }
8194 }
8195
8196 if (destdn->is_auth() && !destdnl->is_null())
8197 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8198
8199 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8200
8201 if (destdn->is_auth())
8202 metablob->add_primary_dentry(destdn, srci, true, true);
8203 else if (force_journal_dest) {
8204 dout(10) << " forced journaling destdn " << *destdn << dendl;
8205 metablob->add_dir_context(destdn->get_dir());
8206 metablob->add_primary_dentry(destdn, srci, true);
8207 if (srcdn->is_auth() && srci->is_dir()) {
8208 // journal new subtrees root dirfrags
8209 list<CDir*> ls;
8210 srci->get_dirfrags(ls);
8211 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
8212 CDir *dir = *p;
8213 if (dir->is_auth())
8214 metablob->add_dir(dir, true);
8215 }
8216 }
8217 }
8218 }
8219
8220 // src
8221 if (srcdn->is_auth()) {
8222 dout(10) << " journaling srcdn " << *srcdn << dendl;
8223 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8224 // also journal the inode in case we need to do slave rename rollback. It is OK to add
8225 // both primary and NULL dentries, because during journal replay the null dentry is
8226 // processed after the primary dentry.
8227 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8228 metablob->add_primary_dentry(srcdn, srci, true);
8229 metablob->add_null_dentry(srcdn, true);
8230 } else
8231 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8232
8233 // make renamed inode first track the dn
8234 if (srcdnl->is_primary() && destdn->is_auth()) {
8235 ceph_assert(srci->first <= destdn->first);
8236 srci->first = destdn->first;
8237 }
8238 // make stray inode first track the straydn
8239 if (straydn && straydn->is_auth()) {
8240 ceph_assert(oldin->first <= straydn->first);
8241 oldin->first = straydn->first;
8242 }
8243
8244 if (oldin && oldin->is_dir()) {
8245 ceph_assert(straydn);
8246 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8247 }
8248 if (srci->is_dir())
8249 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8250
8251 }
8252
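// Contract between _rename_prepare() (above) and _rename_apply() (below),
// sketched: prepare pushes projected state -- straydn/destdn linkages via
// push_projected_linkage(), projected inodes via project_inode(), projected
// snaprealms -- and apply pops each of them (pop_projected_linkage,
// pop_and_dirty_projected_inode, early_pop_projected_snaprealm) only after
// the EUpdate has been journaled, so a crash between the two replays cleanly
// from the log.
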
8253
8254 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8255 {
8256 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8257 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8258
8259 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8260 CDentry::linkage_t *destdnl = destdn->get_linkage();
8261
8262 CInode *oldin = destdnl->get_inode();
8263
8264 // primary+remote link merge?
8265 bool linkmerge = (srcdnl->get_inode() == oldin);
8266 if (linkmerge)
8267 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8268
8269 bool new_in_snaprealm = false;
8270 bool new_oldin_snaprealm = false;
8271
8272 // target inode
8273 if (!linkmerge) {
8274 if (destdnl->is_primary()) {
8275 ceph_assert(straydn);
8276 dout(10) << "straydn is " << *straydn << dendl;
8277
8278 // if there is a newly created snaprealm, we need to split the old
8279 // snaprealm's inodes_with_caps, so pop the snaprealm before the linkage changes.
8280 if (destdn->is_auth()) {
8281 bool hadrealm = (oldin->snaprealm ? true : false);
8282 oldin->early_pop_projected_snaprealm();
8283 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8284 } else {
8285 ceph_assert(mdr->slave_request);
8286 if (mdr->slave_request->desti_snapbl.length()) {
8287 new_oldin_snaprealm = !oldin->snaprealm;
8288 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8289 ceph_assert(oldin->snaprealm);
8290 ceph_assert(oldin->snaprealm->have_past_parents_open());
8291 }
8292 }
8293
8294 destdn->get_dir()->unlink_inode(destdn, false);
8295
8296 straydn->pop_projected_linkage();
8297 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8298 ceph_assert(!straydn->is_projected()); // no other projected
8299
8300 // nlink-- targeti
8301 if (destdn->is_auth())
8302 oldin->pop_and_dirty_projected_inode(mdr->ls);
8303
8304 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
8305 } else if (destdnl->is_remote()) {
8306 destdn->get_dir()->unlink_inode(destdn, false);
8307 if (oldin->is_auth()) {
8308 oldin->pop_and_dirty_projected_inode(mdr->ls);
8309 } else if (mdr->slave_request) {
8310 if (mdr->slave_request->desti_snapbl.length() > 0) {
8311 ceph_assert(oldin->snaprealm);
8312 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8313 }
8314 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8315 delete desti_srnode;
8316 desti_srnode = NULL;
8317 }
8318 }
8319 }
8320
8321 // unlink src before we relink it at dest
8322 CInode *in = srcdnl->get_inode();
8323 ceph_assert(in);
8324
8325 bool srcdn_was_remote = srcdnl->is_remote();
8326 if (!srcdn_was_remote) {
8327 // if a snaprealm was newly created, we need to split the old snaprealm's
8328 // inodes_with_caps, so pop the snaprealm before the linkage changes.
8329 if (destdn->is_auth()) {
8330 bool hadrealm = (in->snaprealm ? true : false);
8331 in->early_pop_projected_snaprealm();
8332 new_in_snaprealm = (in->snaprealm && !hadrealm);
8333 } else {
8334 ceph_assert(mdr->slave_request);
8335 if (mdr->slave_request->srci_snapbl.length()) {
8336 new_in_snaprealm = !in->snaprealm;
8337 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8338 ceph_assert(in->snaprealm);
8339 ceph_assert(in->snaprealm->have_past_parents_open());
8340 }
8341 }
8342 }
8343
8344 srcdn->get_dir()->unlink_inode(srcdn);
8345
8346 // dest
8347 if (srcdn_was_remote) {
8348 if (!linkmerge) {
8349 // destdn
8350 destdnl = destdn->pop_projected_linkage();
8351 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8352 ceph_assert(!destdn->is_projected()); // no other projected
8353
8354 destdn->link_remote(destdnl, in);
8355 if (destdn->is_auth())
8356 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8357 // in
8358 if (in->is_auth()) {
8359 in->pop_and_dirty_projected_inode(mdr->ls);
8360 } else if (mdr->slave_request) {
8361 if (mdr->slave_request->srci_snapbl.length() > 0) {
8362 ceph_assert(in->snaprealm);
8363 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8364 }
8365 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8366 delete srci_srnode;
8367 srci_srnode = NULL;
8368 }
8369 } else {
8370 dout(10) << "merging remote onto primary link" << dendl;
8371 oldin->pop_and_dirty_projected_inode(mdr->ls);
8372 }
8373 } else { // primary
8374 if (linkmerge) {
8375 dout(10) << "merging primary onto remote link" << dendl;
8376 destdn->get_dir()->unlink_inode(destdn, false);
8377 }
8378 destdnl = destdn->pop_projected_linkage();
8379 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8380 ceph_assert(!destdn->is_projected()); // no other projected
8381
8382 // srcdn inode import?
8383 if (!srcdn->is_auth() && destdn->is_auth()) {
8384 ceph_assert(mdr->more()->inode_import.length() > 0);
8385
8386 map<client_t,Capability::Import> imported_caps;
8387
8388 // finish cap imports
8389 finish_force_open_sessions(mdr->more()->imported_session_map);
8390 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
8391 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
8392 mdr->more()->srcdn_auth_mds, true,
8393 mdr->more()->imported_session_map,
8394 mdr->more()->cap_imports[destdnl->get_inode()],
8395 imported_caps);
8396 }
8397
8398 mdr->more()->inode_import.clear();
8399 encode(imported_caps, mdr->more()->inode_import);
8400
8401 /* hack: add an auth pin for each xlock we hold. These were
8402 * remote xlocks previously but now they're local, and
8403 * we're going to try to unpin them when we xlock_finish. */
8404
8405 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
8406 i != mdr->locks.end();
8407 ++i) {
8408 SimpleLock *lock = i->lock;
8409 if (lock->get_parent() != destdnl->get_inode())
8410 break;
8411 if (i->is_xlock() && !lock->is_locallock())
8412 mds->locker->xlock_import(lock);
8413 }
8414
8415 // hack: fix auth bit
8416 in->state_set(CInode::STATE_AUTH);
8417
8418 mdr->clear_ambiguous_auth();
8419 }
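      // at this point the inode import is complete: caps have been reissued
      // to clients, migrated xlocks are pinned locally, and the inode is
      // marked auth on this rank.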
8420
8421 if (destdn->is_auth())
8422 in->pop_and_dirty_projected_inode(mdr->ls);
8423 }
8424
8425 // src
8426 if (srcdn->is_auth())
8427 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
8428 srcdn->pop_projected_linkage();
8429 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8430 ceph_assert(!srcdn->is_projected()); // no other projected
8431
8432 // apply remaining projected inodes (nested)
8433 mdr->apply();
8434
8435 // update subtree map?
8436 if (destdnl->is_primary() && in->is_dir())
8437 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
8438
8439 if (straydn && oldin->is_dir())
8440 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
8441
8442 if (new_oldin_snaprealm)
8443 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
8444 if (new_in_snaprealm)
8445 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
8446
8447 // removing a new dn?
8448 if (srcdn->is_auth())
8449 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
8450 }
8451
8452
8453
8454 // ------------
8455 // SLAVE
8456
8457 class C_MDS_SlaveRenamePrep : public ServerLogContext {
8458 CDentry *srcdn, *destdn, *straydn;
8459 public:
8460 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8461 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
8462 void finish(int r) override {
8463 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
8464 }
8465 };
8466
8467 class C_MDS_SlaveRenameCommit : public ServerContext {
8468 MDRequestRef mdr;
8469 CDentry *srcdn, *destdn, *straydn;
8470 public:
8471 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8472 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
8473 void finish(int r) override {
8474 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
8475 }
8476 };
8477
8478 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
8479 MDRequestRef mdr;
8480 public:
8481 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
8482 ServerContext(s), mdr(r) {}
8483 void finish(int r) override {
8484 server->_slave_rename_sessions_flushed(mdr);
8485 }
8486 };
8487
8488 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
8489 {
8490 dout(10) << "handle_slave_rename_prep " << *mdr
8491 << " " << mdr->slave_request->srcdnpath
8492 << " to " << mdr->slave_request->destdnpath
8493 << dendl;
8494
8495 if (mdr->slave_request->is_interrupted()) {
8496 dout(10) << " slave request interrupted, sending noop reply" << dendl;
8497 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8498 reply->mark_interrupted();
8499 mds->send_message_mds(reply, mdr->slave_to_mds);
8500 mdr->reset_slave_request();
8501 return;
8502 }
8503
8504 // discover destdn
8505 filepath destpath(mdr->slave_request->destdnpath);
8506 dout(10) << " dest " << destpath << dendl;
8507 vector<CDentry*> trace;
8508 CF_MDS_MDRContextFactory cf(mdcache, mdr);
8509 int r = mdcache->path_traverse(mdr, cf, destpath, &trace, NULL, MDS_TRAVERSE_DISCOVERXLOCK);
8510 if (r > 0) return;
8511 if (r == -ESTALE) {
8512 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8513 mdr->slave_to_mds);
8514 return;
8515 }
8516 ceph_assert(r == 0); // we shouldn't get an error here!
8517
8518 CDentry *destdn = trace.back();
8519 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8520 dout(10) << " destdn " << *destdn << dendl;
8521 mdr->pin(destdn);
8522
8523 // discover srcdn
8524 filepath srcpath(mdr->slave_request->srcdnpath);
8525 dout(10) << " src " << srcpath << dendl;
8526 CInode *srci = nullptr;
8527 r = mdcache->path_traverse(mdr, cf, srcpath, &trace, &srci, MDS_TRAVERSE_DISCOVERXLOCK);
8528 if (r > 0) return;
8529 ceph_assert(r == 0);
8530
8531 // srcpath must not point to a null dentry
8532 ceph_assert(srci != nullptr);
8533
8534 CDentry *srcdn = trace.back();
8535 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8536 dout(10) << " srcdn " << *srcdn << dendl;
8537 mdr->pin(srcdn);
8538 mdr->pin(srci);
8539
8540 // stray?
8541 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8542 if (linkmerge)
8543 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8544 CDentry *straydn = mdr->straydn;
8545 if (destdnl->is_primary() && !linkmerge)
8546 ceph_assert(straydn);
8547
8548 mdr->set_op_stamp(mdr->slave_request->op_stamp);
8549 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
8550
8551 // set up commit waiter (early, to clean up any freezing etc we do)
8552 if (!mdr->more()->slave_commit)
8553 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
8554
8555 // am i srcdn auth?
8556 if (srcdn->is_auth()) {
8557 set<mds_rank_t> srcdnrep;
8558 srcdn->list_replicas(srcdnrep);
8559
8560 bool reply_witness = false;
8561 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
8562 // freeze?
8563 // we need this to
8564 // - avoid conflicting lock state changes
8565 // - avoid concurrent updates to the inode
8566 // (this could also be accomplished with the versionlock)
8567 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8568 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
8569 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
8570
8571 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8572 if (srcdnl->get_inode()->is_frozen_auth_pin())
8573 mdr->unfreeze_auth_pin();
8574
8575 if (!frozen_inode) {
8576 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
8577 return;
8578 }
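      // (freeze_inode() only succeeds once the inode's auth_pin count has
      // dropped to the allowance; otherwise the inode is left in the
      // freezing state and the WAIT_FROZEN waiter above retries this
      // request once the remaining pins drain.)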
8579
8580 /*
8581 * set ambiguous auth for srci
8582 * NOTE: we don't worry about ambiguous cache expiry as we do
8583 * with subtree migrations, because all slaves will pin
8584 * srcdn->get_inode() for the duration of this rename.
8585 */
8586 mdr->set_ambiguous_auth(srcdnl->get_inode());
8587
8588 // just mark the source inode as ambiguous auth if more than two MDSes are involved;
8589 // the master will send another OP_RENAMEPREP slave request later.
8590 if (mdr->slave_request->witnesses.size() > 1) {
8591 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
8592 reply_witness = true;
8593 }
8594
8595 // make sure bystanders have received all lock related messages
8596 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8597 if (*p == mdr->slave_to_mds ||
8598 (mds->is_cluster_degraded() &&
8599 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
8600 continue;
8601 auto notify = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
8602 mds->send_message_mds(notify, *p);
8603 mdr->more()->waiting_on_slave.insert(*p);
8604 }
8605
8606 // make sure clients have received all cap related messages
8607 set<client_t> export_client_set;
8608 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
8609
8610 MDSGatherBuilder gather(g_ceph_context);
8611 flush_client_sessions(export_client_set, gather);
8612 if (gather.has_subs()) {
8613 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
8614 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
8615 gather.activate();
8616 }
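      // MDS_RANK_NONE is a placeholder in waiting_on_slave; it is erased by
      // _slave_rename_sessions_flushed() once all of the above client
      // sessions have been flushed, and the slave request is re-dispatched.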
8617 }
8618
8619 // is witness list sufficient?
8620 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8621 if (*p == mdr->slave_to_mds ||
8622 mdr->slave_request->witnesses.count(*p)) continue;
8623 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
8624 reply_witness = true;
8625 break;
8626 }
8627
8628 if (reply_witness) {
8629 ceph_assert(!srcdnrep.empty());
8630 auto reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8631 reply->witnesses.swap(srcdnrep);
8632 mds->send_message_mds(reply, mdr->slave_to_mds);
8633 mdr->reset_slave_request();
8634 return;
8635 }
8636 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
8637 if (!mdr->more()->waiting_on_slave.empty()) {
8638 dout(10) << " still waiting for rename notify acks from "
8639 << mdr->more()->waiting_on_slave << dendl;
8640 return;
8641 }
8642 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
8643 // set ambiguous auth for srci on witnesses
8644 mdr->set_ambiguous_auth(srcdnl->get_inode());
8645 }
8646
8647 // encode everything we'd need to roll this back... basically, just the original state.
8648 rename_rollback rollback;
8649
8650 rollback.reqid = mdr->reqid;
8651
8652 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
8653 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8654 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
8655 rollback.orig_src.dname = srcdn->get_name();
8656 if (srcdnl->is_primary())
8657 rollback.orig_src.ino = srcdnl->get_inode()->ino();
8658 else {
8659 ceph_assert(srcdnl->is_remote());
8660 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
8661 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
8662 }
8663
8664 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
8665 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8666 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
8667 rollback.orig_dest.dname = destdn->get_name();
8668 if (destdnl->is_primary())
8669 rollback.orig_dest.ino = destdnl->get_inode()->ino();
8670 else if (destdnl->is_remote()) {
8671 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
8672 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
8673 }
8674
8675 if (straydn) {
8676 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
8677 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
8678 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
8679 rollback.stray.dname = straydn->get_name();
8680 }
8681 if (mdr->slave_request->desti_snapbl.length()) {
8682 CInode *oldin = destdnl->get_inode();
8683 if (oldin->snaprealm) {
8684 encode(true, rollback.desti_snapbl);
8685 oldin->encode_snap_blob(rollback.desti_snapbl);
8686 } else {
8687 encode(false, rollback.desti_snapbl);
8688 }
8689 }
8690 if (mdr->slave_request->srci_snapbl.length()) {
8691 if (srci->snaprealm) {
8692 encode(true, rollback.srci_snapbl);
8693 srci->encode_snap_blob(rollback.srci_snapbl);
8694 } else {
8695 encode(false, rollback.srci_snapbl);
8696 }
8697 }
8698 encode(rollback, mdr->more()->rollback_bl);
8699 // FIXME: rollback snaprealm
8700 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
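  // note: do_rename_rollback() decodes this same blob to reverse the rename
  // if the master fails, roughly:
  //
  //   rename_rollback rollback;
  //   auto q = rbl.cbegin();
  //   decode(rollback, q);
  //
  // so anything the rollback path needs must be captured above (hence the
  // FIXME about the snaprealm).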
8701
8702 // journal.
8703 mdr->ls = mdlog->get_current_segment();
8704 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
8705 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
8706 mdlog->start_entry(le);
8707 le->rollback = mdr->more()->rollback_bl;
8708
8709 bufferlist blah; // inode import data... obviously not used if we're the slave
8710 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
8711
8712 if (le->commit.empty()) {
8713 dout(10) << " empty metablob, skipping journal" << dendl;
8714 mdlog->cancel_entry(le);
8715 mdr->ls = NULL;
8716 _logged_slave_rename(mdr, srcdn, destdn, straydn);
8717 } else {
8718 mdr->more()->slave_update_journaled = true;
8719 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
8720 mdr, __func__);
8721 mdlog->flush();
8722 }
8723 }
8724
8725 void Server::_logged_slave_rename(MDRequestRef& mdr,
8726 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8727 {
8728 dout(10) << "_logged_slave_rename " << *mdr << dendl;
8729
8730 // prepare ack
8731 MMDSSlaveRequest::ref reply;
8732 if (!mdr->aborted) {
8733 reply = MMDSSlaveRequest::create(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8734 if (!mdr->more()->slave_update_journaled)
8735 reply->mark_not_journaled();
8736 }
8737
8738 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8739 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
8740
8741 // export srci?
8742 if (srcdn->is_auth() && srcdnl->is_primary()) {
8743 // set export bounds for CInode::encode_export()
8744 if (reply) {
8745 list<CDir*> bounds;
8746 if (srcdnl->get_inode()->is_dir()) {
8747 srcdnl->get_inode()->get_dirfrags(bounds);
8748 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
8749 (*p)->state_set(CDir::STATE_EXPORTBOUND);
8750 }
8751
8752 map<client_t,entity_inst_t> exported_client_map;
8753 map<client_t, client_metadata_t> exported_client_metadata_map;
8754 bufferlist inodebl;
8755 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
8756 exported_client_map,
8757 exported_client_metadata_map);
8758
8759 for (list<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p)
8760 (*p)->state_clear(CDir::STATE_EXPORTBOUND);
8761
8762 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
8763 encode(exported_client_metadata_map, reply->inode_export);
8764 reply->inode_export.claim_append(inodebl);
8765 reply->inode_export_v = srcdnl->get_inode()->inode.version;
8766 }
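    // inode_export wire layout: exported_client_map, then
    // exported_client_metadata_map, then the encoded inode itself. The
    // master stashes the blob in mdr->more()->inode_import (see
    // handle_slave_rename_prep_ack) and uses it to finish the cap import
    // in _rename_apply().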
8767
8768 // remove mdr auth pin
8769 mdr->auth_unpin(srcdnl->get_inode());
8770 mdr->more()->is_inode_exporter = true;
8771
8772 if (srcdnl->get_inode()->is_dirty())
8773 srcdnl->get_inode()->mark_clean();
8774
8775 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
8776 }
8777
8778 // apply
8779 _rename_apply(mdr, srcdn, destdn, straydn);
8780
8781 CDentry::linkage_t *destdnl = destdn->get_linkage();
8782
8783 // bump popularity
8784 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8785 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
8786 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
8787
8788 // done.
8789 mdr->reset_slave_request();
8790 mdr->straydn = 0;
8791
8792 if (reply) {
8793 mds->send_message_mds(reply, mdr->slave_to_mds);
8794 } else {
8795 ceph_assert(mdr->aborted);
8796 dout(10) << " abort flag set, finishing" << dendl;
8797 mdcache->request_finish(mdr);
8798 }
8799 }
8800
8801 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
8802 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8803 {
8804 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
8805
8806 CInode *in = destdn->get_linkage()->get_inode();
8807
8808 inodeno_t migrated_stray;
8809 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
8810 migrated_stray = in->ino();
8811
8812 MDSContext::vec finished;
8813 if (r == 0) {
8814 // unfreeze+singleauth inode
8815 // hmm, do i really need to delay this?
8816 if (mdr->more()->is_inode_exporter) {
8817 // drop our pins
8818 // we exported, clear out any xlocks that we moved to another MDS
8819
8820 for (auto i = mdr->locks.lower_bound(&in->versionlock);
8821 i != mdr->locks.end(); ) {
8822 SimpleLock *lock = i->lock;
8823 if (lock->get_parent() != in)
8824 break;
8825 // we only care about xlocks on the exported inode
8826 if (i->is_xlock() && !lock->is_locallock())
8827 mds->locker->xlock_export(i++, mdr.get());
8828 else
8829 ++i;
8830 }
8831
8832 map<client_t,Capability::Import> peer_imported;
8833 auto bp = mdr->more()->inode_import.cbegin();
8834 decode(peer_imported, bp);
8835
8836 dout(10) << " finishing inode export on " << *in << dendl;
8837 mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
8838 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
8839
8840 // unfreeze
8841 ceph_assert(in->is_frozen_inode());
8842 in->unfreeze_inode(finished);
8843 }
8844
8845 // singleauth
8846 if (mdr->more()->is_ambiguous_auth) {
8847 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8848 mdr->more()->is_ambiguous_auth = false;
8849 }
8850
8851 if (straydn && mdr->more()->slave_update_journaled) {
8852 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8853 if (strayin && !strayin->snaprealm)
8854 mdcache->clear_dirty_bits_for_stray(strayin);
8855 }
8856
8857 mds->queue_waiters(finished);
8858 mdr->cleanup();
8859
8860 if (mdr->more()->slave_update_journaled) {
8861 // write a commit to the journal
8862 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
8863 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
8864 ESlaveUpdate::RENAME);
8865 mdlog->start_entry(le);
8866 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
8867 mdlog->flush();
8868 } else {
8869 _committed_slave(mdr);
8870 }
8871 } else {
8872
8873 // abort
8874 // rollback_bl may be empty if we froze the inode but had to provide an expanded
8875 // witness list to the master, and it failed before we tried prep again.
8876 if (mdr->more()->rollback_bl.length()) {
8877 if (mdr->more()->is_inode_exporter) {
8878 dout(10) << " reversing inode export of " << *in << dendl;
8879 in->abort_export();
8880 }
8881 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
8882 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
8883 // rollback but preserve the slave request
8884 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
8885 mdr->more()->rollback_bl.clear();
8886 } else
8887 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
8888 } else {
8889 dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
8890 // singleauth
8891 if (mdr->more()->is_ambiguous_auth) {
8892 if (srcdn->is_auth())
8893 mdr->more()->rename_inode->unfreeze_inode(finished);
8894
8895 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
8896 mdr->more()->is_ambiguous_auth = false;
8897 }
8898 mds->queue_waiters(finished);
8899 mdcache->request_finish(mdr);
8900 }
8901 }
8902
8903 if (migrated_stray && mds->is_stopping())
8904 mdcache->shutdown_export_stray_finish(migrated_stray);
8905 }
8906
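/*
 * _rollback_repair_dir: undo the fragstat/rstat deltas a rename applied to a
 * single dir fragment. linkunlink is the link delta to re-apply (+1 to
 * restore an entry, -1 to drop one); the old mtime/rctime are restored only
 * if the rename was the last thing to touch them (ctime match).
 */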
8907 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
8908 bool isdir, int linkunlink, nest_info_t &rstat)
8909 {
8910 fnode_t *pf;
8911 pf = dir->project_fnode();
8912 mut->add_projected_fnode(dir);
8913 pf->version = dir->pre_dirty();
8914
8915 if (isdir) {
8916 pf->fragstat.nsubdirs += linkunlink;
8917 } else {
8918 pf->fragstat.nfiles += linkunlink;
8919 }
8920 if (r.ino) {
8921 pf->rstat.rbytes += linkunlink * rstat.rbytes;
8922 pf->rstat.rfiles += linkunlink * rstat.rfiles;
8923 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
8924 pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
8925 }
8926 if (pf->fragstat.mtime == ctime) {
8927 pf->fragstat.mtime = r.dirfrag_old_mtime;
8928 if (pf->rstat.rctime == ctime)
8929 pf->rstat.rctime = r.dirfrag_old_rctime;
8930 }
8931 mut->add_updated_lock(&dir->get_inode()->filelock);
8932 mut->add_updated_lock(&dir->get_inode()->nestlock);
8933 }
8934
8935 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
8936 MutationRef mut;
8937 CDentry *srcdn;
8938 version_t srcdnpv;
8939 CDentry *destdn;
8940 CDentry *straydn;
8941 map<client_t,MClientSnap::ref> splits[2];
8942 bool finish_mdr;
8943 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
8944 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
8945 map<client_t,MClientSnap::ref> _splits[2], bool f) :
8946 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
8947 straydn(st), finish_mdr(f) {
8948 splits[0].swap(_splits[0]);
8949 splits[1].swap(_splits[1]);
8950 }
8951 void finish(int r) override {
8952 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
8953 destdn, straydn, splits, finish_mdr);
8954 }
8955 };
8956
8957 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
8958 bool finish_mdr)
8959 {
8960 rename_rollback rollback;
8961 auto p = rbl.cbegin();
8962 decode(rollback, p);
8963
8964 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
8965 // need to finish this update before sending resolve to claim the subtree
8966 mdcache->add_rollback(rollback.reqid, master);
8967
8968 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
8969 mut->ls = mds->mdlog->get_current_segment();
8970
8971 CDentry *srcdn = NULL;
8972 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
8973 if (!srcdir)
8974 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
8975 if (srcdir) {
8976 dout(10) << " srcdir " << *srcdir << dendl;
8977 srcdn = srcdir->lookup(rollback.orig_src.dname);
8978 if (srcdn) {
8979 dout(10) << " srcdn " << *srcdn << dendl;
8980 ceph_assert(srcdn->get_linkage()->is_null());
8981 } else
8982 dout(10) << " srcdn not found" << dendl;
8983 } else
8984 dout(10) << " srcdir not found" << dendl;
8985
8986 CDentry *destdn = NULL;
8987 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
8988 if (!destdir)
8989 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
8990 if (destdir) {
8991 dout(10) << " destdir " << *destdir << dendl;
8992 destdn = destdir->lookup(rollback.orig_dest.dname);
8993 if (destdn)
8994 dout(10) << " destdn " << *destdn << dendl;
8995 else
8996 dout(10) << " destdn not found" << dendl;
8997 } else
8998 dout(10) << " destdir not found" << dendl;
8999
9000 CInode *in = NULL;
9001 if (rollback.orig_src.ino) {
9002 in = mdcache->get_inode(rollback.orig_src.ino);
9003 if (in && in->is_dir())
9004 ceph_assert(srcdn && destdn);
9005 } else
9006 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9007
9008 CDir *straydir = NULL;
9009 CDentry *straydn = NULL;
9010 if (rollback.stray.dirfrag.ino) {
9011 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9012 if (straydir) {
9013 dout(10) << "straydir " << *straydir << dendl;
9014 straydn = straydir->lookup(rollback.stray.dname);
9015 if (straydn) {
9016 dout(10) << " straydn " << *straydn << dendl;
9017 ceph_assert(straydn->get_linkage()->is_primary());
9018 } else
9019 dout(10) << " straydn not found" << dendl;
9020 } else
9021 dout(10) << "straydir not found" << dendl;
9022 }
9023
9024 CInode *target = NULL;
9025 if (rollback.orig_dest.ino) {
9026 target = mdcache->get_inode(rollback.orig_dest.ino);
9027 if (target)
9028 ceph_assert(destdn && straydn);
9029 } else if (rollback.orig_dest.remote_ino)
9030 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9031
9032 // can't use is_auth() in the resolve stage
9033 mds_rank_t whoami = mds->get_nodeid();
9034 // slave
9035 ceph_assert(!destdn || destdn->authority().first != whoami);
9036 ceph_assert(!straydn || straydn->authority().first != whoami);
9037
9038 bool force_journal_src = false;
9039 bool force_journal_dest = false;
9040 if (in && in->is_dir() && srcdn->authority().first != whoami)
9041 force_journal_src = _need_force_journal(in, false);
9042 if (in && target && target->is_dir())
9043 force_journal_dest = _need_force_journal(in, true);
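  // even when this rank is not auth for the renamed dir itself, it may be
  // auth for subtree fragments underneath it; in that case the rollback must
  // be journaled here too so replay keeps those subtrees consistent.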
9044
9045 version_t srcdnpv = 0;
9046 // repair src
9047 if (srcdn) {
9048 if (srcdn->authority().first == whoami)
9049 srcdnpv = srcdn->pre_dirty();
9050 if (rollback.orig_src.ino) {
9051 ceph_assert(in);
9052 srcdn->push_projected_linkage(in);
9053 } else
9054 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9055 rollback.orig_src.remote_d_type);
9056 }
9057
9058 map<client_t,MClientSnap::ref> splits[2];
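  // splits[0] collects realm-merge notifications for the source inode,
  // splits[1] for the overwritten target; both are sent to clients in
  // _rename_rollback_finish() once the rollback has been applied.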
9059
9060 CInode::mempool_inode *pip = nullptr;
9061 if (in) {
9062 bool projected;
9063 if (in->get_projected_parent_dn()->authority().first == whoami) {
9064 auto &pi = in->project_inode();
9065 pip = &pi.inode;
9066 mut->add_projected_inode(in);
9067 pip->version = in->pre_dirty();
9068 projected = true;
9069 } else {
9070 pip = in->get_projected_inode();
9071 projected = false;
9072 }
9073 if (pip->ctime == rollback.ctime)
9074 pip->ctime = rollback.orig_src.old_ctime;
9075
9076 if (rollback.srci_snapbl.length() && in->snaprealm) {
9077 bool hadrealm;
9078 auto p = rollback.srci_snapbl.cbegin();
9079 decode(hadrealm, p);
9080 if (hadrealm) {
9081 if (projected && !mds->is_resolve()) {
9082 sr_t *new_srnode = new sr_t();
9083 decode(*new_srnode, p);
9084 in->project_snaprealm(new_srnode);
9085 } else
9086 decode(in->snaprealm->srnode, p);
9087 } else {
9088 SnapRealm *realm;
9089 if (rollback.orig_src.ino) {
9090 ceph_assert(srcdir);
9091 realm = srcdir->get_inode()->find_snaprealm();
9092 } else {
9093 realm = in->snaprealm->parent;
9094 }
9095 if (!mds->is_resolve())
9096 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9097 if (projected)
9098 in->project_snaprealm(NULL);
9099 else
9100 in->snaprealm->merge_to(realm);
9101 }
9102 }
9103 }
9104
9105 if (srcdn && srcdn->authority().first == whoami) {
9106 nest_info_t blah;
9107 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9108 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
9109 }
9110
9111 // repair dest
9112 if (destdn) {
9113 if (rollback.orig_dest.ino && target) {
9114 destdn->push_projected_linkage(target);
9115 } else if (rollback.orig_dest.remote_ino) {
9116 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9117 rollback.orig_dest.remote_d_type);
9118 } else {
9119 // the dentry will be trimmed soon; it's OK to have the wrong linkage
9120 if (rollback.orig_dest.ino)
9121 ceph_assert(mds->is_resolve());
9122 destdn->push_projected_linkage();
9123 }
9124 }
9125
9126 if (straydn)
9127 straydn->push_projected_linkage();
9128
9129 if (target) {
9130 bool projected;
9131 CInode::mempool_inode *ti = nullptr;
9132 if (target->get_projected_parent_dn()->authority().first == whoami) {
9133 auto &pi = target->project_inode();
9134 ti = &pi.inode;
9135 mut->add_projected_inode(target);
9136 ti->version = target->pre_dirty();
9137 projected = true;
9138 } else {
9139 ti = target->get_projected_inode();
9140 projected = false;
9141 }
9142 if (ti->ctime == rollback.ctime)
9143 ti->ctime = rollback.orig_dest.old_ctime;
9144 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9145 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9146 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9147 else
9148 ceph_assert(rollback.orig_dest.remote_ino &&
9149 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9150 } else
9151 ti->nlink++;
9152
9153 if (rollback.desti_snapbl.length() && target->snaprealm) {
9154 bool hadrealm;
9155 auto p = rollback.desti_snapbl.cbegin();
9156 decode(hadrealm, p);
9157 if (hadrealm) {
9158 if (projected && !mds->is_resolve()) {
9159 sr_t *new_srnode = new sr_t();
9160 decode(*new_srnode, p);
9161 target->project_snaprealm(new_srnode);
9162 } else
9163 decode(target->snaprealm->srnode, p);
9164 } else {
9165 SnapRealm *realm;
9166 if (rollback.orig_dest.ino) {
9167 ceph_assert(destdir);
9168 realm = destdir->get_inode()->find_snaprealm();
9169 } else {
9170 realm = target->snaprealm->parent;
9171 }
9172 if (!mds->is_resolve())
9173 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9174 if (projected)
9175 target->project_snaprealm(NULL);
9176 else
9177 target->snaprealm->merge_to(realm);
9178 }
9179 }
9180 }
9181
9182 if (srcdn)
9183 dout(0) << " srcdn back to " << *srcdn << dendl;
9184 if (in)
9185 dout(0) << " srci back to " << *in << dendl;
9186 if (destdn)
9187 dout(0) << " destdn back to " << *destdn << dendl;
9188 if (target)
9189 dout(0) << " desti back to " << *target << dendl;
9190
9191 // journal it
9192 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
9193 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
9194 mdlog->start_entry(le);
9195
9196 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9197 le->commit.add_dir_context(srcdir);
9198 if (rollback.orig_src.ino)
9199 le->commit.add_primary_dentry(srcdn, 0, true);
9200 else
9201 le->commit.add_remote_dentry(srcdn, true);
9202 }
9203
9204 if (!rollback.orig_src.ino && // remote linkage
9205 in && in->authority().first == whoami) {
9206 le->commit.add_dir_context(in->get_projected_parent_dir());
9207 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9208 }
9209
9210 if (force_journal_dest) {
9211 ceph_assert(rollback.orig_dest.ino);
9212 le->commit.add_dir_context(destdir);
9213 le->commit.add_primary_dentry(destdn, 0, true);
9214 }
9215
9216 // slave: no need to journal straydn
9217
9218 if (target && target != in && target->authority().first == whoami) {
9219 ceph_assert(rollback.orig_dest.remote_ino);
9220 le->commit.add_dir_context(target->get_projected_parent_dir());
9221 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9222 }
9223
9224 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9225 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9226 le->commit.renamed_dirino = in->ino();
9227 if (srcdn->authority().first == whoami) {
9228 list<CDir*> ls;
9229 in->get_dirfrags(ls);
9230 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
9231 CDir *dir = *p;
9232 if (!dir->is_auth())
9233 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9234 }
9235 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9236 }
9237 } else if (force_journal_dest) {
9238 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9239 le->commit.renamed_dirino = target->ino();
9240 }
9241
9242 if (target && target->is_dir()) {
9243 ceph_assert(destdn);
9244 mdcache->project_subtree_rename(target, straydir, destdir);
9245 }
9246
9247 if (in && in->is_dir()) {
9248 ceph_assert(srcdn);
9249 mdcache->project_subtree_rename(in, destdir, srcdir);
9250 }
9251
9252 if (mdr && !mdr->more()->slave_update_journaled) {
9253 ceph_assert(le->commit.empty());
9254 mdlog->cancel_entry(le);
9255 mut->ls = NULL;
9256 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
9257 } else {
9258 ceph_assert(!le->commit.empty());
9259 if (mdr)
9260 mdr->more()->slave_update_journaled = false;
9261 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9262 srcdn, srcdnpv, destdn, straydn,
9263 splits, finish_mdr);
9264 submit_mdlog_entry(le, fin, mdr, __func__);
9265 mdlog->flush();
9266 }
9267 }
9268
9269 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
9270 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9271 map<client_t,MClientSnap::ref> splits[2], bool finish_mdr)
9272 {
9273 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9274
9275 if (straydn) {
9276 straydn->get_dir()->unlink_inode(straydn);
9277 straydn->pop_projected_linkage();
9278 }
9279 if (destdn) {
9280 destdn->get_dir()->unlink_inode(destdn);
9281 destdn->pop_projected_linkage();
9282 }
9283 if (srcdn) {
9284 srcdn->pop_projected_linkage();
9285 if (srcdn->authority().first == mds->get_nodeid()) {
9286 srcdn->mark_dirty(srcdnpv, mut->ls);
9287 if (srcdn->get_linkage()->is_primary())
9288 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9289 }
9290 }
9291
9292 mut->apply();
9293
9294 if (srcdn && srcdn->get_linkage()->is_primary()) {
9295 CInode *in = srcdn->get_linkage()->get_inode();
9296 if (in && in->is_dir()) {
9297 ceph_assert(destdn);
9298 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9299 }
9300 }
9301
9302 if (destdn) {
9303 CInode *oldin = destdn->get_linkage()->get_inode();
9304 // update subtree map?
9305 if (oldin && oldin->is_dir()) {
9306 ceph_assert(straydn);
9307 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9308 }
9309 }
9310
9311 if (mds->is_resolve()) {
9312 CDir *root = NULL;
9313 if (straydn)
9314 root = mdcache->get_subtree_root(straydn->get_dir());
9315 else if (destdn)
9316 root = mdcache->get_subtree_root(destdn->get_dir());
9317 if (root)
9318 mdcache->try_trim_non_auth_subtree(root);
9319 } else {
9320 mdcache->send_snaps(splits[1]);
9321 mdcache->send_snaps(splits[0]);
9322 }
9323
9324 if (mdr) {
9325 MDSContext::vec finished;
9326 if (mdr->more()->is_ambiguous_auth) {
9327 if (srcdn->is_auth())
9328 mdr->more()->rename_inode->unfreeze_inode(finished);
9329
9330 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9331 mdr->more()->is_ambiguous_auth = false;
9332 }
9333 mds->queue_waiters(finished);
9334 if (finish_mdr || mdr->aborted)
9335 mdcache->request_finish(mdr);
9336 else
9337 mdr->more()->slave_rolling_back = false;
9338 }
9339
9340 mdcache->finish_rollback(mut->reqid);
9341
9342 mut->cleanup();
9343 }
9344
9345 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
9346 {
9347 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9348 << " witnessed by " << ack->get_source()
9349 << " " << *ack << dendl;
9350 mds_rank_t from = mds_rank_t(ack->get_source().num());
9351
9352 // note slave
9353 mdr->more()->slaves.insert(from);
9354 if (mdr->more()->srcdn_auth_mds == from &&
9355 mdr->more()->is_remote_frozen_authpin &&
9356 !mdr->more()->is_ambiguous_auth) {
9357 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
9358 }
9359
9360 // witnessed? or add extra witnesses?
9361 ceph_assert(mdr->more()->witnessed.count(from) == 0);
9362 if (ack->is_interrupted()) {
9363 dout(10) << " slave request interrupted, noop" << dendl;
9364 } else if (ack->witnesses.empty()) {
9365 mdr->more()->witnessed.insert(from);
9366 if (!ack->is_not_journaled())
9367 mdr->more()->has_journaled_slaves = true;
9368 } else {
9369 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
9370 mdr->more()->extra_witnesses = ack->witnesses;
9371 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
9372 }
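  // recording extra witnesses makes the re-dispatch below restart the rename
  // prep, so the master can ask those srcdn replicas to witness the rename
  // as well.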
9373
9374 // srci import?
9375 if (ack->inode_export.length()) {
9376 dout(10) << " got srci import" << dendl;
9377 mdr->more()->inode_import.share(ack->inode_export);
9378 mdr->more()->inode_import_v = ack->inode_export_v;
9379 }
9380
9381 // remove from waiting list
9382 ceph_assert(mdr->more()->waiting_on_slave.count(from));
9383 mdr->more()->waiting_on_slave.erase(from);
9384
9385 if (mdr->more()->waiting_on_slave.empty())
9386 dispatch_client_request(mdr); // go again!
9387 else
9388 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
9389 }
9390
9391 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const MMDSSlaveRequest::const_ref &ack)
9392 {
9393 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
9394 << ack->get_source() << dendl;
9395 ceph_assert(mdr->is_slave());
9396 mds_rank_t from = mds_rank_t(ack->get_source().num());
9397
9398 if (mdr->more()->waiting_on_slave.count(from)) {
9399 mdr->more()->waiting_on_slave.erase(from);
9400
9401 if (mdr->more()->waiting_on_slave.empty()) {
9402 if (mdr->slave_request)
9403 dispatch_slave_request(mdr);
9404 } else
9405 dout(10) << " still waiting for rename notify acks from "
9406 << mdr->more()->waiting_on_slave << dendl;
9407 }
9408 }
9409
9410 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
9411 {
9412 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
9413
9414 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
9415 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
9416
9417 if (mdr->more()->waiting_on_slave.empty()) {
9418 if (mdr->slave_request)
9419 dispatch_slave_request(mdr);
9420 } else
9421 dout(10) << " still waiting for rename notify acks from "
9422 << mdr->more()->waiting_on_slave << dendl;
9423 }
9424 }
9425
9426 // snaps
9427 /* This function takes responsibility for the passed mdr*/
9428 void Server::handle_client_lssnap(MDRequestRef& mdr)
9429 {
9430 const MClientRequest::const_ref &req = mdr->client_request;
9431
9432 // traverse to path
9433 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9434 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9435 respond_to_request(mdr, -ESTALE);
9436 return;
9437 }
9438 if (!diri->is_auth()) {
9439 mdcache->request_forward(mdr, diri->authority().first);
9440 return;
9441 }
9442 if (!diri->is_dir()) {
9443 respond_to_request(mdr, -ENOTDIR);
9444 return;
9445 }
9446 dout(10) << "lssnap on " << *diri << dendl;
9447
9448 // lock snap
9449 MutationImpl::LockOpVec lov;
9450 mds->locker->include_snap_rdlocks(diri, lov);
9451 if (!mds->locker->acquire_locks(mdr, lov))
9452 return;
9453
9454 if (!check_access(mdr, diri, MAY_READ))
9455 return;
9456
9457 SnapRealm *realm = diri->find_snaprealm();
9458 map<snapid_t,const SnapInfo*> infomap;
9459 realm->get_snap_info(infomap, diri->get_oldest_snap());
9460
9461 unsigned max_entries = req->head.args.readdir.max_entries;
9462 if (!max_entries)
9463 max_entries = infomap.size();
9464 int max_bytes = req->head.args.readdir.max_bytes;
9465 if (!max_bytes)
9466 // make sure at least one item can be encoded
9467 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
9468
9469 __u64 last_snapid = 0;
9470 string offset_str = req->get_path2();
9471 if (!offset_str.empty())
9472 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
9473
9474 // empty DirStat
9475 bufferlist dirbl;
9476 static DirStat empty;
9477 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
9478
9479 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
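  // the reply mimics a readdir reply: (empty) DirStat, entry count,
  // end/complete flags, then the per-snapshot payload (name, infinite lease,
  // inodestat) appended below.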
9480
9481 __u32 num = 0;
9482 bufferlist dnbl;
9483 auto p = infomap.upper_bound(last_snapid);
9484 for (; p != infomap.end() && num < max_entries; ++p) {
9485 dout(10) << p->first << " -> " << *p->second << dendl;
9486
9487 // actual
9488 string snap_name;
9489 if (p->second->ino == diri->ino())
9490 snap_name = p->second->name;
9491 else
9492 snap_name = p->second->get_long_name();
9493
9494 unsigned start_len = dnbl.length();
9495 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
9496 break;
9497
9498 encode(snap_name, dnbl);
9499 // infinite lease
9500 LeaseStat e(-1, -1, 0);
9501 mds->locker->encode_lease(dnbl, mdr->session->info, e);
9502 dout(20) << "encode_infinite_lease" << dendl;
9503
9504 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
9505 if (r < 0) {
9506 bufferlist keep;
9507 keep.substr_of(dnbl, 0, start_len);
9508 dnbl.swap(keep);
9509 break;
9510 }
9511 ++num;
9512 }
9513
9514 encode(num, dirbl);
9515 __u16 flags = 0;
9516 if (p == infomap.end()) {
9517 flags = CEPH_READDIR_FRAG_END;
9518 if (last_snapid == 0)
9519 flags |= CEPH_READDIR_FRAG_COMPLETE;
9520 }
9521 encode(flags, dirbl);
9522 dirbl.claim_append(dnbl);
9523
9524 mdr->reply_extra_bl = dirbl;
9525 mdr->tracei = diri;
9526 respond_to_request(mdr, 0);
9527 }
9528
9529
9530 // MKSNAP
9531
9532 struct C_MDS_mksnap_finish : public ServerLogContext {
9533 CInode *diri;
9534 SnapInfo info;
9535 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
9536 ServerLogContext(s, r), diri(di), info(i) {}
9537 void finish(int r) override {
9538 server->_mksnap_finish(mdr, diri, info);
9539 }
9540 };
9541
9542 /* This function takes responsibility for the passed mdr*/
9543 void Server::handle_client_mksnap(MDRequestRef& mdr)
9544 {
9545 const MClientRequest::const_ref &req = mdr->client_request;
9546 // make sure we have as new a map as the client
9547 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
9548 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
9549 return;
9550 }
9551 if (!mds->mdsmap->allows_snaps()) {
9552 // snapshot creation is disabled until the allow_new_snaps fs option is set
9553 respond_to_request(mdr, -EPERM);
9554 return;
9555 }
9556
9557 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9558 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9559 respond_to_request(mdr, -ESTALE);
9560 return;
9561 }
9562
9563 if (!diri->is_auth()) { // fw to auth?
9564 mdcache->request_forward(mdr, diri->authority().first);
9565 return;
9566 }
9567
9568 // dir only
9569 if (!diri->is_dir()) {
9570 respond_to_request(mdr, -ENOTDIR);
9571 return;
9572 }
9573 if (diri->is_system() && !diri->is_root()) {
9574 // no snaps in system dirs (root is ok)
9575 respond_to_request(mdr, -EPERM);
9576 return;
9577 }
9578
9579 std::string_view snapname = req->get_filepath().last_dentry();
9580
9581 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9582 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9583 respond_to_request(mdr, -EPERM);
9584 return;
9585 }
9586
9587 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
9588
9589 // lock snap
9590 MutationImpl::LockOpVec lov;
9591
9592 mds->locker->include_snap_rdlocks(diri, lov);
9593 lov.erase_rdlock(&diri->snaplock);
9594 lov.add_xlock(&diri->snaplock);
9595
9596 if (!mds->locker->acquire_locks(mdr, lov))
9597 return;
9598
9599 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9600 return;
9601
9602 // make sure name is unique
9603 if (diri->snaprealm &&
9604 diri->snaprealm->exists(snapname)) {
9605 respond_to_request(mdr, -EEXIST);
9606 return;
9607 }
9608 if (snapname.length() == 0 ||
9609 snapname[0] == '_') {
9610 respond_to_request(mdr, -EINVAL);
9611 return;
9612 }
9613
9614 // allocate a snapid
9615 if (!mdr->more()->stid) {
9616 // prepare an stid
9617 mds->snapclient->prepare_create(diri->ino(), snapname,
9618 mdr->get_mds_stamp(),
9619 &mdr->more()->stid, &mdr->more()->snapidbl,
9620 new C_MDS_RetryRequest(mdcache, mdr));
9621 return;
9622 }
9623
9624 version_t stid = mdr->more()->stid;
9625 snapid_t snapid;
9626 auto p = mdr->more()->snapidbl.cbegin();
9627 decode(snapid, p);
9628 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
9629
9630 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9631
9632 // journal
9633 SnapInfo info;
9634 info.ino = diri->ino();
9635 info.snapid = snapid;
9636 info.name = snapname;
9637 info.stamp = mdr->get_op_stamp();
9638
9639 auto &pi = diri->project_inode(false, true);
9640 pi.inode.ctime = info.stamp;
9641 if (info.stamp > pi.inode.rstat.rctime)
9642 pi.inode.rstat.rctime = info.stamp;
9643 pi.inode.rstat.rsnaps++;
9644 pi.inode.version = diri->pre_dirty();
9645
9646 // project the snaprealm
9647 auto &newsnap = *pi.snapnode;
9648 newsnap.created = snapid;
9649 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
9650 if (!em.second)
9651 em.first->second = info;
9652 newsnap.seq = snapid;
9653 newsnap.last_created = snapid;
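  // bumping the realm's seq (and last_created) to the new snapid is what
  // prompts clients to refresh the realm and notice the new snapshot.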
9654
9655 // journal the inode changes
9656 mdr->ls = mdlog->get_current_segment();
9657 EUpdate *le = new EUpdate(mdlog, "mksnap");
9658 mdlog->start_entry(le);
9659
9660 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9661 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9662 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9663 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9664
9665 // journal the snaprealm changes
9666 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
9667 mdr, __func__);
9668 mdlog->flush();
9669 }
9670
9671 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
9672 {
9673 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
9674
9675 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
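  // if the directory already had a snaprealm this is a plain CREATE; if the
  // realm is being created by this snapshot, existing caps must be split out
  // of the parent realm, hence SPLIT.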
9676
9677 diri->pop_and_dirty_projected_inode(mdr->ls);
9678 mdr->apply();
9679
9680 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9681
9682 // create snap
9683 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9684
9685 // notify other mds
9686 mdcache->send_snap_update(diri, mdr->more()->stid, op);
9687
9688 mdcache->do_realm_invalidate_and_update_notify(diri, op);
9689
9690 // yay
9691 mdr->in[0] = diri;
9692 mdr->snapid = info.snapid;
9693 mdr->tracei = diri;
9694 respond_to_request(mdr, 0);
9695 }
9696
9697
9698 // RMSNAP
9699
9700 struct C_MDS_rmsnap_finish : public ServerLogContext {
9701 CInode *diri;
9702 snapid_t snapid;
9703 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
9704 ServerLogContext(s, r), diri(di), snapid(sn) {}
9705 void finish(int r) override {
9706 server->_rmsnap_finish(mdr, diri, snapid);
9707 }
9708 };
9709
9710 /* This function takes responsibility for the passed mdr*/
9711 void Server::handle_client_rmsnap(MDRequestRef& mdr)
9712 {
9713 const MClientRequest::const_ref &req = mdr->client_request;
9714
9715 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9716 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9717 respond_to_request(mdr, -ESTALE);
9718 return;
9719 }
9720 if (!diri->is_auth()) { // fw to auth?
9721 mdcache->request_forward(mdr, diri->authority().first);
9722 return;
9723 }
9724 if (!diri->is_dir()) {
9725 respond_to_request(mdr, -ENOTDIR);
9726 return;
9727 }
9728
9729 std::string_view snapname = req->get_filepath().last_dentry();
9730
9731 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9732 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9733 respond_to_request(mdr, -EPERM);
9734 return;
9735 }
9736
9737 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
9738
9739 // does snap exist?
9740 if (snapname.length() == 0 || snapname[0] == '_') {
9741 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
9742 return;
9743 }
9744 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
9745 respond_to_request(mdr, -ENOENT);
9746 return;
9747 }
9748 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
9749 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
9750
9751 MutationImpl::LockOpVec lov;
9752 mds->locker->include_snap_rdlocks(diri, lov);
9753 lov.erase_rdlock(&diri->snaplock);
9754 lov.add_xlock(&diri->snaplock);
9755
9756 if (!mds->locker->acquire_locks(mdr, lov))
9757 return;
9758
9759 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9760 return;
9761
9762 // prepare
9763 if (!mdr->more()->stid) {
9764 mds->snapclient->prepare_destroy(diri->ino(), snapid,
9765 &mdr->more()->stid, &mdr->more()->snapidbl,
9766 new C_MDS_RetryRequest(mdcache, mdr));
9767 return;
9768 }
9769 version_t stid = mdr->more()->stid;
9770 auto p = mdr->more()->snapidbl.cbegin();
9771 snapid_t seq;
9772 decode(seq, p);
9773 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
9774
9775 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9776
9777 // journal
9778 auto &pi = diri->project_inode(false, true);
9779 pi.inode.version = diri->pre_dirty();
9780 pi.inode.ctime = mdr->get_op_stamp();
9781 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
9782 pi.inode.rstat.rctime = mdr->get_op_stamp();
9783 pi.inode.rstat.rsnaps--;
9784
9785 mdr->ls = mdlog->get_current_segment();
9786 EUpdate *le = new EUpdate(mdlog, "rmsnap");
9787 mdlog->start_entry(le);
9788
9789 // project the snaprealm
9790 auto &newnode = *pi.snapnode;
9791 newnode.snaps.erase(snapid);
9792 newnode.seq = seq;
9793 newnode.last_destroyed = seq;
9794
9795 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9796 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9797 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9798 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9799
9800 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
9801 mdr, __func__);
9802 mdlog->flush();
9803 }
9804
9805 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9806 {
9807 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
9808 snapid_t stid = mdr->more()->stid;
9809 auto p = mdr->more()->snapidbl.cbegin();
9810 snapid_t seq;
9811 decode(seq, p);
9812
9813 diri->pop_and_dirty_projected_inode(mdr->ls);
9814 mdr->apply();
9815
9816 mds->snapclient->commit(stid, mdr->ls);
9817
9818 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9819
9820 // notify other mds
9821 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
9822
9823 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
9824
9825 // yay
9826 mdr->in[0] = diri;
9827 respond_to_request(mdr, 0);
9828
9829 // purge snapshot data
9830 if (diri->snaprealm->have_past_parents_open())
9831 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
9832 }
9833
9834 struct C_MDS_renamesnap_finish : public ServerLogContext {
9835 CInode *diri;
9836 snapid_t snapid;
9837 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
9838 ServerLogContext(s, r), diri(di), snapid(sn) {}
9839 void finish(int r) override {
9840 server->_renamesnap_finish(mdr, diri, snapid);
9841 }
9842 };
9843
9844 /* This function takes responsibility for the passed mdr*/
9845 void Server::handle_client_renamesnap(MDRequestRef& mdr)
9846 {
9847 const MClientRequest::const_ref &req = mdr->client_request;
9848 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
9849 respond_to_request(mdr, -EINVAL);
9850 return;
9851 }
9852
9853 CInode *diri = mdcache->get_inode(req->get_filepath().get_ino());
9854 if (!diri || diri->state_test(CInode::STATE_PURGING)) {
9855 respond_to_request(mdr, -ESTALE);
9856 return;
9857 }
9858
9859 if (!diri->is_auth()) { // fw to auth?
9860 mdcache->request_forward(mdr, diri->authority().first);
9861 return;
9862 }
9863
9864 if (!diri->is_dir()) { // dir only
9865 respond_to_request(mdr, -ENOTDIR);
9866 return;
9867 }
9868
9869 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
9870 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9871 respond_to_request(mdr, -EPERM);
9872 return;
9873 }
9874
9875 std::string_view dstname = req->get_filepath().last_dentry();
9876 std::string_view srcname = req->get_filepath2().last_dentry();
9877 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
9878
9879 if (srcname.length() == 0 || srcname[0] == '_') {
9880 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
9881 return;
9882 }
9883 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
9884 respond_to_request(mdr, -ENOENT);
9885 return;
9886 }
9887 if (dstname.length() == 0 || dstname[0] == '_') {
9888 respond_to_request(mdr, -EINVAL);
9889 return;
9890 }
9891 if (diri->snaprealm->exists(dstname)) {
9892 respond_to_request(mdr, -EEXIST);
9893 return;
9894 }
9895
9896 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
9897 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
9898
9899 // lock snap
9900 MutationImpl::LockOpVec lov;
9901
9902 mds->locker->include_snap_rdlocks(diri, lov);
9903 lov.erase_rdlock(&diri->snaplock);
9904 lov.add_xlock(&diri->snaplock);
9905
9906 if (!mds->locker->acquire_locks(mdr, lov))
9907 return;
9908
9909 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9910 return;
9911
9912 // prepare
9913 if (!mdr->more()->stid) {
9914 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
9915 &mdr->more()->stid,
9916 new C_MDS_RetryRequest(mdcache, mdr));
9917 return;
9918 }
9919
9920 version_t stid = mdr->more()->stid;
9921 dout(10) << " stid is " << stid << dendl;
9922
9923 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9924
9925 // journal
9926 auto &pi = diri->project_inode(false, true);
9927 pi.inode.ctime = mdr->get_op_stamp();
9928 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
9929 pi.inode.rstat.rctime = mdr->get_op_stamp();
9930 pi.inode.version = diri->pre_dirty();
9931
9932 // project the snaprealm
9933 auto &newsnap = *pi.snapnode;
9934 auto it = newsnap.snaps.find(snapid);
9935 ceph_assert(it != newsnap.snaps.end());
9936 it->second.name = dstname;
9937
9938 // journal the inode changes
9939 mdr->ls = mdlog->get_current_segment();
9940 EUpdate *le = new EUpdate(mdlog, "renamesnap");
9941 mdlog->start_entry(le);
9942
9943 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9944 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9945 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9946 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9947
9948 // journal the snaprealm changes
9949 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
9950 mdr, __func__);
9951 mdlog->flush();
9952 }
9953
9954 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
9955 {
9956 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
9957
9958 diri->pop_and_dirty_projected_inode(mdr->ls);
9959 mdr->apply();
9960
9961 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9962
9963 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9964
9965 // notify other mds
9966 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
9967
9968 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
9969
9970 // yay
9971 mdr->in[0] = diri;
9972 mdr->tracei = diri;
9973 mdr->snapid = snapid;
9974 respond_to_request(mdr, 0);
9975 }
9976
9977 /**
9978 * Return true if server is in state RECONNECT and this
9979 * client has not yet reconnected.
9980 */
9981 bool Server::waiting_for_reconnect(client_t c) const
9982 {
9983 return client_reconnect_gather.count(c) > 0;
9984 }
9985
9986 void Server::dump_reconnect_status(Formatter *f) const
9987 {
9988 f->open_object_section("reconnect_status");
9989 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
9990 f->close_section();
9991 }