ceph/src/mds/Server.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "cephfs_features.h"
33
34 #include "msg/Messenger.h"
35
36 #include "osdc/Objecter.h"
37
38 #include "events/EUpdate.h"
39 #include "events/ESlaveUpdate.h"
40 #include "events/ESession.h"
41 #include "events/EOpen.h"
42 #include "events/ECommitted.h"
43 #include "events/EPurged.h"
44
45 #include "include/stringify.h"
46 #include "include/filepath.h"
47 #include "common/errno.h"
48 #include "common/Timer.h"
49 #include "common/perf_counters.h"
50 #include "include/compat.h"
51 #include "osd/OSDMap.h"
52
53 #include <errno.h>
54 #include <math.h>
55
56 #include <list>
57 #include <iostream>
58 #include <string_view>
59
60 #include "common/config.h"
61
62 #define dout_context g_ceph_context
63 #define dout_subsys ceph_subsys_mds
64 #undef dout_prefix
65 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
66
67 class ServerContext : public MDSContext {
68 protected:
69 Server *server;
70 MDSRank *get_mds() override
71 {
72 return server->mds;
73 }
74
75 public:
76 explicit ServerContext(Server *s) : server(s) {
77 ceph_assert(server != NULL);
78 }
79 };
80
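// Batch_Getattr_Lookup aggregates concurrent getattr/lookup requests that
// resolve to the same target: the front request (mdr) does the actual path
// traversal and locking, while the others are parked in mdr->batch_reqs and
// later forwarded or answered together, reusing the front request's trace
// (tracei/tracedn).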
81 class Batch_Getattr_Lookup : public BatchOp {
82 protected:
83 Server* server;
84 ceph::ref_t<MDRequestImpl> mdr;
85 MDCache* mdcache;
86 int res = 0;
87 public:
88 Batch_Getattr_Lookup(Server* s, ceph::ref_t<MDRequestImpl> r, MDCache* mdc) : server(s), mdr(std::move(r)), mdcache(mdc) {}
89 void add_request(const ceph::ref_t<MDRequestImpl>& m) override {
90 mdr->batch_reqs.push_back(m);
91 }
92 void set_request(const ceph::ref_t<MDRequestImpl>& m) override {
93 mdr = m;
94 }
95 void _forward(mds_rank_t t) override {
96 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
97 mdr->set_mds_stamp(ceph_clock_now());
98 for (auto& m : mdr->batch_reqs) {
99 if (!m->killed)
100 mdcache->request_forward(m, t);
101 }
102 mdr->batch_reqs.clear();
103 }
104 void _respond(int r) override {
105 mdr->set_mds_stamp(ceph_clock_now());
106 for (auto& m : mdr->batch_reqs) {
107 if (!m->killed) {
108 m->tracei = mdr->tracei;
109 m->tracedn = mdr->tracedn;
110 server->respond_to_request(m, r);
111 }
112 }
113 mdr->batch_reqs.clear();
114 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
115 }
116 void print(std::ostream& o) {
117 o << "[batch front=" << *mdr << "]";
118 }
119 };
120
121 class ServerLogContext : public MDSLogContextBase {
122 protected:
123 Server *server;
124 MDSRank *get_mds() override
125 {
126 return server->mds;
127 }
128
129 MDRequestRef mdr;
130 void pre_finish(int r) override {
131 if (mdr)
132 mdr->mark_event("journal_committed: ");
133 }
134 public:
135 explicit ServerLogContext(Server *s) : server(s) {
136 ceph_assert(server != NULL);
137 }
138 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
139 ceph_assert(server != NULL);
140 }
141 };
142
143 void Server::create_logger()
144 {
145 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
146
147 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
148 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
149 plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
150 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
151 plb.add_u64_counter(l_mdss_handle_client_session,
152 "handle_client_session", "Client session messages", "hcs",
153 PerfCountersBuilder::PRIO_INTERESTING);
154 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
155 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
156
157 // fop latencies are useful
158 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
159 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
160 "Request type lookup hash of inode latency");
161 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
162 "Request type lookup inode latency");
163 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
164 "Request type lookup parent latency");
165 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
166 "Request type lookup name latency");
167 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
168 "Request type lookup latency");
169 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
170 "Request type lookup snapshot latency");
171 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
172 "Request type get attribute latency");
173 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
174 "Request type set attribute latency");
175 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
176 "Request type set file layout latency");
177 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
178 "Request type set directory layout latency");
179 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
180 "Request type set extended attribute latency");
181 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
182 "Request type remove extended attribute latency");
183 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
184 "Request type read directory latency");
185 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
186 "Request type set file lock latency");
187 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
188 "Request type get file lock latency");
189 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
190 "Request type create latency");
191 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
192 "Request type open latency");
193 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
194 "Request type make node latency");
195 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
196 "Request type link latency");
197 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
198 "Request type unlink latency");
199 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
200 "Request type remove directory latency");
201 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
202 "Request type rename latency");
203 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
204 "Request type make directory latency");
205 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
206 "Request type symbolic link latency");
207 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
208 "Request type list snapshot latency");
209 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
210 "Request type make snapshot latency");
211 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
212 "Request type remove snapshot latency");
213 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
214 "Request type rename snapshot latency");
215
216 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
217 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
218 "Client requests dispatched");
219 plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
220 "Server requests dispatched");
221
222 logger = plb.create_perf_counters();
223 g_ceph_context->get_perfcounters_collection()->add(logger);
224 }
225
226 Server::Server(MDSRank *m) :
227 mds(m),
228 mdcache(mds->mdcache), mdlog(mds->mdlog),
229 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
230 {
231 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
232 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
233 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
234 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
235 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
236 }
237
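// Top-level dispatcher for messages handled by the Server. Reconnect
// messages are handled unconditionally; client requests that arrive before
// the MDS is active are queued for (client)replay, deferred until the MDS
// becomes active, or dropped when they belong to a closed session.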
238 void Server::dispatch(const cref_t<Message> &m)
239 {
240 switch (m->get_type()) {
241 case CEPH_MSG_CLIENT_RECONNECT:
242 handle_client_reconnect(ref_cast<MClientReconnect>(m));
243 return;
244 }
245
246 /*
247 * In the reconnect phase, clients may send unsafe requests to the mds before the reconnect msg. Setting sessionclosed_isok handles scenarios like this:
248 
249 1. In the reconnect phase, a client sent unsafe requests to the mds.
250 2. The reconnect timeout was reached. All sessions that did not send a reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
251 (Another situation is #31668, which denies all client reconnect msgs to speed up reboot.)
252 3. So these unsafe requests, from sessions that did not send a reconnect msg in time or were denied, can be handled in the clientreplay phase.
253
254 */
255 bool sessionclosed_isok = replay_unsafe_with_closed_session;
256 // active?
257 // handle_slave_request()/handle_client_session() will wait if necessary
258 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
259 const auto &req = ref_cast<MClientRequest>(m);
260 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
261 Session *session = mds->get_session(req);
262 if (!session || (!session->is_open() && !sessionclosed_isok)) {
263 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
264 return;
265 }
266 bool queue_replay = false;
267 if (req->is_replay() || req->is_async()) {
268 dout(3) << "queuing replayed op" << dendl;
269 queue_replay = true;
270 if (req->head.ino &&
271 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
272 mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
273 }
274 } else if (req->get_retry_attempt()) {
275 // process completed requests in the clientreplay stage. A completed request
276 // might have created a new file/directory. This guarantees the MDS sends a reply
277 // to the client before another request modifies the new file/directory.
278 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
279 dout(3) << "queuing completed op" << dendl;
280 queue_replay = true;
281 }
282 // this request was created before the cap reconnect message, drop any embedded
283 // cap releases.
284 req->releases.clear();
285 }
286 if (queue_replay) {
287 req->mark_queued_for_replay();
288 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
289 return;
290 }
291 }
292
293 bool wait_for_active = true;
294 if (mds->is_stopping()) {
295 wait_for_active = false;
296 } else if (mds->is_clientreplay()) {
297 if (req->is_queued_for_replay()) {
298 wait_for_active = false;
299 }
300 }
301 if (wait_for_active) {
302 dout(3) << "not active yet, waiting" << dendl;
303 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
304 return;
305 }
306 }
307
308 switch (m->get_type()) {
309 case CEPH_MSG_CLIENT_SESSION:
310 handle_client_session(ref_cast<MClientSession>(m));
311 return;
312 case CEPH_MSG_CLIENT_REQUEST:
313 handle_client_request(ref_cast<MClientRequest>(m));
314 return;
315 case CEPH_MSG_CLIENT_RECLAIM:
316 handle_client_reclaim(ref_cast<MClientReclaim>(m));
317 return;
318 case MSG_MDS_SLAVE_REQUEST:
319 handle_slave_request(ref_cast<MMDSSlaveRequest>(m));
320 return;
321 default:
322 derr << "server unknown message " << m->get_type() << dendl;
323 ceph_abort_msg("server unknown message");
324 }
325 }
326
327
328
329 // ----------------------------------------------------------
330 // SESSION management
331
332 class C_MDS_session_finish : public ServerLogContext {
333 Session *session;
334 uint64_t state_seq;
335 bool open;
336 version_t cmapv;
337 interval_set<inodeno_t> inos;
338 version_t inotablev;
339 interval_set<inodeno_t> purge_inos;
340 LogSegment *ls = nullptr;
341 Context *fin;
342 public:
343 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
344 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
345 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv, Context *fin_ = NULL) :
346 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), fin(fin_) { }
347 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv,
348 interval_set<inodeno_t> _purge_inos, LogSegment *_ls, Context *fin_ = NULL) :
349 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), purge_inos(std::move(_purge_inos)), ls(_ls), fin(fin_){}
350 void finish(int r) override {
351 ceph_assert(r == 0);
352 server->_session_logged(session, state_seq, open, cmapv, inos, inotablev, purge_inos, ls);
353 if (fin) {
354 fin->complete(r);
355 }
356 }
357 };
358
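// Find a session whose client-supplied "uuid" metadata matches. During a
// reclaim two sessions may share a uuid; in that case the session doing the
// reclaiming is preferred over the one being reclaimed from.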
359 Session* Server::find_session_by_uuid(std::string_view uuid)
360 {
361 Session* session = nullptr;
362 for (auto& it : mds->sessionmap.get_sessions()) {
363 auto& metadata = it.second->info.client_metadata;
364
365 auto p = metadata.find("uuid");
366 if (p == metadata.end() || p->second != uuid)
367 continue;
368
369 if (!session) {
370 session = it.second;
371 } else if (!session->reclaiming_from) {
372 assert(it.second->reclaiming_from == session);
373 session = it.second;
374 } else {
375 assert(session->reclaiming_from == it.second);
376 }
377 }
378 return session;
379 }
380
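// Handle MClientReclaim: a new client session asks to take over the state of
// an older session with the same uuid. Only CEPH_RECLAIM_RESET is currently
// supported; the old session is evicted (blacklisted if configured) and the
// reply carries the address being taken over.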
381 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
382 {
383 if (!session->is_open() && !session->is_stale()) {
384 dout(10) << "session not open, dropping this req" << dendl;
385 return;
386 }
387
388 auto reply = make_message<MClientReclaimReply>(0);
389 if (m->get_uuid().empty()) {
390 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
391 reply->set_result(-EINVAL);
392 mds->send_message_client(reply, session);
393 return;
394 }
395
396 unsigned flags = m->get_flags();
397 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
398 dout(10) << __func__ << " unsupported flags" << dendl;
399 reply->set_result(-EOPNOTSUPP);
400 mds->send_message_client(reply, session);
401 return;
402 }
403
404 Session* target = find_session_by_uuid(m->get_uuid());
405 if (target) {
406 if (session->info.auth_name != target->info.auth_name) {
407 dout(10) << __func__ << " session auth_name " << session->info.auth_name
408 << " != target auth_name " << target->info.auth_name << dendl;
409 reply->set_result(-EPERM);
410 mds->send_message_client(reply, session);
return; // permission denied: do not fall through and start the reclaim anyway
411 }
412
413 assert(!target->reclaiming_from);
414 assert(!session->reclaiming_from);
415 session->reclaiming_from = target;
416 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
417 }
418
419 if (flags & CEPH_RECLAIM_RESET) {
420 finish_reclaim_session(session, reply);
421 return;
422 }
423
424 ceph_abort();
425 }
426
427 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
428 {
429 Session *target = session->reclaiming_from;
430 if (target) {
431 session->reclaiming_from = nullptr;
432
433 Context *send_reply;
434 if (reply) {
435 int64_t session_id = session->get_client().v;
436 send_reply = new LambdaContext([this, session_id, reply](int r) {
437 assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
438 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
439 if (!session) {
440 return;
441 }
442 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
443 reply->set_epoch(epoch);
444 mds->send_message_client(reply, session);
445 });
446 } else {
447 send_reply = nullptr;
448 }
449
450 bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
451 return map.is_blacklisted(target->info.inst.addr);
452 });
453
454 if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
455 kill_session(target, send_reply);
456 } else {
457 std::stringstream ss;
458 mds->evict_client(target->get_client().v, false, true, ss, send_reply);
459 }
460 } else if (reply) {
461 mds->send_message_client(reply, session);
462 }
463 }
464
465 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
466 {
467 Session *session = mds->get_session(m);
468 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
469 assert(m->get_source().is_client()); // should _not_ come from an mds!
470
471 if (!session) {
472 dout(0) << " ignoring sessionless msg " << *m << dendl;
473 return;
474 }
475
476 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
477 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
478 return;
479 }
480
481 if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
482 finish_reclaim_session(session);
483 } else {
484 reclaim_session(session, m);
485 }
486 }
487
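// Handle session control messages. Opens are validated (blacklist, required
// features, 'root' path caps, duplicate uuid) and journaled via an ESession
// event, as are closes; renewcaps and flush acks are handled in-line.
// _session_logged() applies the journaled state change.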
488 void Server::handle_client_session(const cref_t<MClientSession> &m)
489 {
490 version_t pv;
491 Session *session = mds->get_session(m);
492
493 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
494 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
495
496 if (!session) {
497 dout(0) << " ignoring sessionless msg " << *m << dendl;
498 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
499 reply->metadata["error_string"] = "sessionless";
500 mds->send_message(reply, m->get_connection());
501 return;
502 }
503
504 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
505 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
506 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
507 // close requests need to be handled when mds is active
508 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
509 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
510 return;
511 }
512 } else {
513 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
514 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
515 return;
516 }
517 }
518
519 if (logger)
520 logger->inc(l_mdss_handle_client_session);
521
522 uint64_t sseq = 0;
523 switch (m->get_op()) {
524 case CEPH_SESSION_REQUEST_OPEN:
525 if (session->is_opening() ||
526 session->is_open() ||
527 session->is_stale() ||
528 session->is_killing() ||
529 terminating_sessions) {
530 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
531 return;
532 }
533 ceph_assert(session->is_closed() || session->is_closing());
534
535 if (mds->is_stopping()) {
536 dout(10) << "mds is stopping, dropping open req" << dendl;
537 return;
538 }
539
540 {
541 auto& addr = session->info.inst.addr;
542 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
543 auto& client_metadata = session->info.client_metadata;
544
545 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
546 auto now = ceph_clock_now();
547 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
548 auto elapsed = now - m->get_recv_stamp();
549 CachedStackStringStream css;
550 *css << "New client session:"
551 << " addr=\"" << session->info.inst.addr << "\""
552 << ",elapsed=" << elapsed
553 << ",throttled=" << throttle_elapsed
554 << ",status=\"" << status << "\"";
555 if (!err.empty()) {
556 *css << ",error=\"" << err << "\"";
557 }
558 const auto& metadata = session->info.client_metadata;
559 if (auto it = metadata.find("root"); it != metadata.end()) {
560 *css << ",root=\"" << it->second << "\"";
561 }
562 dout(2) << css->strv() << dendl;
563 };
564
565 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
566 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
567 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
568 m->metadata["error_string"] = err_str;
569 mds->send_message_client(m, session);
570 log_session_status("REJECTED", err_str);
571 };
572
573 bool blacklisted = mds->objecter->with_osdmap(
574 [&addr](const OSDMap &osd_map) -> bool {
575 return osd_map.is_blacklisted(addr);
576 });
577
578 if (blacklisted) {
579 dout(10) << "rejecting blacklisted client " << addr << dendl;
580 send_reject_message("blacklisted");
581 session->clear();
582 break;
583 }
584
585 if (client_metadata.features.empty())
586 infer_supported_features(session, client_metadata);
587
588 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
589 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
590 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
591 for (const auto& p : client_metadata) {
592 dout(20) << " " << p.first << ": " << p.second << dendl;
593 }
594
595 feature_bitset_t missing_features = required_client_features;
596 missing_features -= client_metadata.features;
597 if (!missing_features.empty()) {
598 stringstream ss;
599 ss << "missing required features '" << missing_features << "'";
600 send_reject_message(ss.str());
601 mds->clog->warn() << "client session (" << session->info.inst
602 << ") lacks required features " << missing_features
603 << "; client supports " << client_metadata.features;
604 session->clear();
605 break;
606 }
607
608 // Special case for the 'root' metadata path; validate that the claimed
609 // root is actually within the caps of the session
610 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
611 auto claimed_root = it->second;
612 stringstream ss;
613 bool denied = false;
614 // claimed_root has a leading "/" which we strip before passing
615 // into caps check
616 if (claimed_root.empty() || claimed_root[0] != '/') {
617 denied = true;
618 ss << "invalue root '" << claimed_root << "'";
619 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
620 denied = true;
621 ss << "non-allowable root '" << claimed_root << "'";
622 }
623
624 if (denied) {
625 // Tell the client we're rejecting their open
626 send_reject_message(ss.str());
627 mds->clog->warn() << "client session with " << ss.str()
628 << " denied (" << session->info.inst << ")";
629 session->clear();
630 break;
631 }
632 }
633
634 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
635 if (find_session_by_uuid(it->second)) {
636 send_reject_message("duplicated session uuid");
637 mds->clog->warn() << "client session with duplicated session uuid '"
638 << it->second << "' denied (" << session->info.inst << ")";
639 session->clear();
640 break;
641 }
642 }
643
644 if (session->is_closed())
645 mds->sessionmap.add_session(session);
646
647 pv = mds->sessionmap.mark_projected(session);
648 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
649 mds->sessionmap.touch_session(session);
650 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
651 ceph_assert(r == 0);
652 log_session_status("ACCEPTED", "");
653 });
654 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
655 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
656 mdlog->flush();
657 }
658 break;
659
660 case CEPH_SESSION_REQUEST_RENEWCAPS:
661 if (session->is_open() || session->is_stale()) {
662 mds->sessionmap.touch_session(session);
663 if (session->is_stale()) {
664 mds->sessionmap.set_state(session, Session::STATE_OPEN);
665 mds->locker->resume_stale_caps(session);
666 mds->sessionmap.touch_session(session);
667 }
668 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
669 mds->send_message_client(reply, session);
670 } else {
671 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
672 }
673 break;
674
675 case CEPH_SESSION_REQUEST_CLOSE:
676 {
677 if (session->is_closed() ||
678 session->is_closing() ||
679 session->is_killing()) {
680 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
681 return;
682 }
683 if (session->is_importing()) {
684 dout(10) << "ignoring close req on importing session" << dendl;
685 return;
686 }
687 ceph_assert(session->is_open() ||
688 session->is_stale() ||
689 session->is_opening());
690 if (m->get_seq() < session->get_push_seq()) {
691 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
692 << ", dropping" << dendl;
693 return;
694 }
695 // We are getting a seq that is higher than expected.
696 // Handle it the same as any other seq error.
697 //
698 if (m->get_seq() != session->get_push_seq()) {
699 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
700 << ", BUGGY!" << dendl;
701 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
702 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
703 return;
704 }
705 journal_close_session(session, Session::STATE_CLOSING, NULL);
706 }
707 break;
708
709 case CEPH_SESSION_FLUSHMSG_ACK:
710 finish_flush_session(session, m->get_seq());
711 break;
712
713 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
714 if (mds->is_active())
715 mdlog->flush();
716 break;
717
718 default:
719 ceph_abort();
720 }
721 }
722
723
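// Ask a single client to flush its messages; the gather sub completes when
// the matching CEPH_SESSION_FLUSHMSG_ACK arrives (see finish_flush_session).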
724 void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
725 if (!session->is_open() ||
726 !session->get_connection() ||
727 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
728 return;
729 }
730
731 version_t seq = session->wait_for_flush(gather->new_sub());
732 mds->send_message_client(
733 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
734 }
735
736 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
737 {
738 for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
739 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
740 ceph_assert(session);
741 flush_session(session, &gather);
742 }
743 }
744
745 void Server::finish_flush_session(Session *session, version_t seq)
746 {
747 MDSContext::vec finished;
748 session->finish_flush(seq, finished);
749 mds->queue_waiters(finished);
750 }
751
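// Completion for a journaled ESession event (see C_MDS_session_finish):
// applies the open/close to the SessionMap, releases (or purges)
// preallocated inos via the InoTable, and on close/kill tears down the
// session's remaining caps, leases and connection.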
752 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
753 const interval_set<inodeno_t>& inos, version_t piv,
754 const interval_set<inodeno_t>& purge_inos, LogSegment *ls)
755 {
756 dout(10) << "_session_logged " << session->info.inst
757 << " state_seq " << state_seq
758 << " " << (open ? "open":"close")
759 << " " << pv
760 << " purge_inos : " << purge_inos << dendl;
761
762 if (NULL != ls) {
763 dout(10) << "_session_logged seq : " << ls->seq << dendl;
764 if (purge_inos.size()){
765 ls->purge_inodes.insert(purge_inos);
766 mdcache->purge_inodes(purge_inos, ls);
767 }
768 }
769
770 if (piv) {
771 ceph_assert(session->is_closing() || session->is_killing() ||
772 session->is_opening()); // re-open closing session
773 session->info.prealloc_inos.subtract(inos);
774 session->delegated_inos.clear();
775 mds->inotable->apply_release_ids(inos);
776 ceph_assert(mds->inotable->get_version() == piv);
777 }
778
779 mds->sessionmap.mark_dirty(session);
780
781 // apply
782 if (session->get_state_seq() != state_seq) {
783 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
784 << ", noop" << dendl;
785 // close must have been canceled (by an import?), or any number of other things..
786 } else if (open) {
787 ceph_assert(session->is_opening());
788 mds->sessionmap.set_state(session, Session::STATE_OPEN);
789 mds->sessionmap.touch_session(session);
790 ceph_assert(session->get_connection());
791 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
792 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
793 reply->supported_features = supported_features;
794 mds->send_message_client(reply, session);
795 if (mdcache->is_readonly()) {
796 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
797 mds->send_message_client(m, session);
798 }
799 } else if (session->is_closing() ||
800 session->is_killing()) {
801 // kill any lingering capabilities, leases, requests
802 while (!session->caps.empty()) {
803 Capability *cap = session->caps.front();
804 CInode *in = cap->get_inode();
805 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
806 mds->locker->remove_client_cap(in, cap, true);
807 }
808 while (!session->leases.empty()) {
809 ClientLease *r = session->leases.front();
810 CDentry *dn = static_cast<CDentry*>(r->parent);
811 dout(20) << " killing client lease of " << *dn << dendl;
812 dn->remove_client_lease(r, mds->locker);
813 }
814 if (client_reconnect_gather.erase(session->info.get_client())) {
815 dout(20) << " removing client from reconnect set" << dendl;
816 if (client_reconnect_gather.empty()) {
817 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
818 reconnect_gather_finish();
819 }
820 }
821 if (client_reclaim_gather.erase(session->info.get_client())) {
822 dout(20) << " removing client from reclaim set" << dendl;
823 if (client_reclaim_gather.empty()) {
824 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
825 mds->maybe_clientreplay_done();
826 }
827 }
828
829 if (session->is_closing()) {
830 // mark con disposable. if there is a fault, we will get a
831 // reset and clean it up. if the client hasn't received the
832 // CLOSE message yet, they will reconnect and get an
833 // ms_handle_remote_reset() and realize they had in fact closed.
834 // do this *before* sending the message to avoid a possible
835 // race.
836 if (session->get_connection()) {
837 // Conditional because terminate_sessions will indiscriminately
838 // put sessions in CLOSING whether they ever had a conn or not.
839 session->get_connection()->mark_disposable();
840 }
841
842 // reset session
843 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
844 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
845 session->clear();
846 mds->sessionmap.remove_session(session);
847 } else if (session->is_killing()) {
848 // destroy session, close connection
849 if (session->get_connection()) {
850 session->get_connection()->mark_down();
851 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
852 session->set_connection(nullptr);
853 }
854 mds->sessionmap.remove_session(session);
855 } else {
856 ceph_abort();
857 }
858 } else {
859 ceph_abort();
860 }
861 }
862
863 /**
864 * Inject sessions from some source other than actual connections.
865 *
866 * For example:
867 * - sessions inferred from journal replay
868 * - sessions learned from other MDSs during rejoin
869 * - sessions learned from other MDSs during dir/caps migration
870 * - sessions learned from other MDSs during a cross-MDS rename
871 */
872 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
873 map<client_t,client_metadata_t>& cmm,
874 map<client_t, pair<Session*,uint64_t> >& smap)
875 {
876 version_t pv = mds->sessionmap.get_projected();
877
878 dout(10) << "prepare_force_open_sessions " << pv
879 << " on " << cm.size() << " clients"
880 << dendl;
881
882 mds->objecter->with_osdmap(
883 [this, &cm, &cmm](const OSDMap &osd_map) {
884 for (auto p = cm.begin(); p != cm.end(); ) {
885 if (osd_map.is_blacklisted(p->second.addr)) {
886 dout(10) << " ignoring blacklisted client." << p->first
887 << " (" << p->second.addr << ")" << dendl;
888 cmm.erase(p->first);
889 cm.erase(p++);
890 } else {
891 ++p;
892 }
893 }
894 });
895
896 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
897 Session *session = mds->sessionmap.get_or_add_session(p->second);
898 pv = mds->sessionmap.mark_projected(session);
899 uint64_t sseq;
900 if (session->is_closed() ||
901 session->is_closing() ||
902 session->is_killing()) {
903 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
904 auto q = cmm.find(p->first);
905 if (q != cmm.end())
906 session->info.client_metadata.merge(q->second);
907 } else {
908 ceph_assert(session->is_open() ||
909 session->is_opening() ||
910 session->is_stale());
911 sseq = 0;
912 }
913 smap[p->first] = make_pair(session, sseq);
914 session->inc_importing();
915 }
916 return pv;
917 }
918
919 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
920 bool dec_import)
921 {
922 /*
923 * FIXME: need to carefully consider the race conditions between a
924 * client trying to close a session and an MDS doing an import
925 * trying to force open a session...
926 */
927 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
928 << " initial v " << mds->sessionmap.get_version() << dendl;
929
930 for (auto &it : smap) {
931 Session *session = it.second.first;
932 uint64_t sseq = it.second.second;
933 if (sseq > 0) {
934 if (session->get_state_seq() != sseq) {
935 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
936 } else {
937 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
938 mds->sessionmap.set_state(session, Session::STATE_OPEN);
939 mds->sessionmap.touch_session(session);
940
941 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
942 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
943 reply->supported_features = supported_features;
944 mds->send_message_client(reply, session);
945
946 if (mdcache->is_readonly())
947 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
948 }
949 } else {
950 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
951 ceph_assert(session->is_open() || session->is_stale());
952 }
953
954 if (dec_import) {
955 session->dec_importing();
956 }
957
958 mds->sessionmap.mark_dirty(session);
959 }
960
961 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
962 }
963
964 class C_MDS_TerminatedSessions : public ServerContext {
965 void finish(int r) override {
966 server->terminating_sessions = false;
967 }
968 public:
969 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
970 };
971
972 void Server::terminate_sessions()
973 {
974 dout(5) << "terminating all sessions..." << dendl;
975
976 terminating_sessions = true;
977
978 // kill them off. clients will retry etc.
979 set<Session*> sessions;
980 mds->sessionmap.get_client_session_set(sessions);
981 for (set<Session*>::const_iterator p = sessions.begin();
982 p != sessions.end();
983 ++p) {
984 Session *session = *p;
985 if (session->is_closing() ||
986 session->is_killing() ||
987 session->is_closed())
988 continue;
989 journal_close_session(session, Session::STATE_CLOSING, NULL);
990 }
991
992 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
993 }
994
995
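// Periodic scan: open sessions that have stopped renewing caps are marked
// STALE (and their caps revoked); sessions stale past the autoclose window,
// or past a client-specified "timeout", are evicted.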
996 void Server::find_idle_sessions()
997 {
998 auto now = clock::now();
999 auto last_cleared_laggy = mds->last_cleared_laggy();
1000
1001 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1002
1003 // timeout/stale
1004 // (caps go stale, lease die)
1005 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1006 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1007
1008 // don't kick clients if we've been laggy
1009 if (last_cleared_laggy < cutoff) {
1010 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1011 << "), not marking any client stale" << dendl;
1012 return;
1013 }
1014
1015 std::vector<Session*> to_evict;
1016
1017 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1018 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1019 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1020 std::vector<Session*> new_stale;
1021
1022 for (auto session : *(sessions_p1->second)) {
1023 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1024 if (last_cap_renew_span < cutoff) {
1025 dout(20) << "laggiest active session is " << session->info.inst
1026 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1027 break;
1028 }
1029
1030 if (session->last_seen > session->last_cap_renew) {
1031 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1032 if (last_cap_renew_span < cutoff) {
1033 dout(20) << "laggiest active session is " << session->info.inst
1034 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1035 continue;
1036 }
1037 }
1038
1039 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1040 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1041 "has arrived" << dendl;
1042 // evict session without marking it stale
1043 to_evict.push_back(session);
1044 continue;
1045 }
1046
1047 if (defer_session_stale &&
1048 !session->is_any_flush_waiter() &&
1049 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1050 dout(20) << "deferring marking session " << session->info.inst << " stale "
1051 "since it holds no caps" << dendl;
1052 continue;
1053 }
1054
1055 auto it = session->info.client_metadata.find("timeout");
1056 if (it != session->info.client_metadata.end()) {
1057 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1058 if (timeout == 0) {
1059 dout(10) << "skipping session " << session->info.inst
1060 << ", infinite timeout specified" << dendl;
1061 continue;
1062 }
1063 double cutoff = queue_max_age + timeout;
1064 if (last_cap_renew_span < cutoff) {
1065 dout(10) << "skipping session " << session->info.inst
1066 << ", timeout (" << timeout << ") specified"
1067 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1068 continue;
1069 }
1070
1071 // do not go through stale, evict it directly.
1072 to_evict.push_back(session);
1073 } else {
1074 dout(10) << "new stale session " << session->info.inst
1075 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1076 new_stale.push_back(session);
1077 }
1078 }
1079
1080 for (auto session : new_stale) {
1081 mds->sessionmap.set_state(session, Session::STATE_STALE);
1082 if (mds->locker->revoke_stale_caps(session)) {
1083 mds->locker->remove_stale_leases(session);
1084 finish_flush_session(session, session->get_push_seq());
1085 auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
1086 mds->send_message_client(m, session);
1087 } else {
1088 to_evict.push_back(session);
1089 }
1090 }
1091 }
1092
1093 // autoclose
1094 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1095
1096 // Collect a list of sessions exceeding the autoclose threshold
1097 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1098 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1099 for (auto session : *(sessions_p2->second)) {
1100 assert(session->is_stale());
1101 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1102 if (last_cap_renew_span < cutoff) {
1103 dout(20) << "oldest stale session is " << session->info.inst
1104 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1105 break;
1106 }
1107 to_evict.push_back(session);
1108 }
1109 }
1110
1111 for (auto session: to_evict) {
1112 if (session->is_importing()) {
1113 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1114 continue;
1115 }
1116
1117 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1118 mds->clog->warn() << "evicting unresponsive client " << *session
1119 << ", after " << last_cap_renew_span << " seconds";
1120 dout(10) << "autoclosing stale session " << session->info.inst
1121 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1122
1123 if (g_conf()->mds_session_blacklist_on_timeout) {
1124 std::stringstream ss;
1125 mds->evict_client(session->get_client().v, false, true, ss, nullptr);
1126 } else {
1127 kill_session(session, NULL);
1128 }
1129 }
1130 }
1131
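// Evict clients that have not responded to a cap revoke within
// mds_cap_revoke_eviction_timeout seconds; a timeout of 0 disables the check.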
1132 void Server::evict_cap_revoke_non_responders() {
1133 if (!cap_revoke_eviction_timeout) {
1134 return;
1135 }
1136
1137 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1138
1139 for (auto const &client: to_evict) {
1140 mds->clog->warn() << "client id " << client << " has not responded to"
1141 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1142 << " seconds, evicting";
1143 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1144 << client << dendl;
1145
1146 std::stringstream ss;
1147 bool evicted = mds->evict_client(client.v, false,
1148 g_conf()->mds_session_blacklist_on_evict,
1149 ss, nullptr);
1150 if (evicted && logger) {
1151 logger->inc(l_mdss_cap_revoke_eviction);
1152 }
1153 }
1154 }
1155
1156 void Server::handle_conf_change(const std::set<std::string>& changed) {
1157 if (changed.count("mds_replay_unsafe_with_closed_session")) {
1158 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
1159 }
1160 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1161 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1162 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1163 << cap_revoke_eviction_timeout << dendl;
1164 }
1165 if (changed.count("mds_recall_max_decay_rate")) {
1166 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1167 }
1168 if (changed.count("mds_max_snaps_per_dir")) {
1169 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1170 dout(20) << __func__ << " max snapshots per directory changed to "
1171 << max_snaps_per_dir << dendl;
1172 }
1173 if (changed.count("mds_client_delegate_inos_pct")) {
1174 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1175 }
1176 }
1177
1178 /*
1179 * XXX bump in the interface here, not using an MDSContext here
1180 * because all the callers right now happen to use a SaferCond
1181 */
1182 void Server::kill_session(Session *session, Context *on_safe, bool need_purge_inos)
1183 {
1184 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1185
1186 if ((session->is_opening() ||
1187 session->is_open() ||
1188 session->is_stale()) &&
1189 !session->is_importing()) {
1190 dout(10) << "kill_session " << session << dendl;
1191 journal_close_session(session, Session::STATE_KILLING, on_safe, need_purge_inos);
1192 } else {
1193 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1194 if (session->is_closing() ||
1195 session->is_killing()) {
1196 if (on_safe)
1197 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1198 } else {
1199 ceph_assert(session->is_closed() ||
1200 session->is_importing());
1201 if (on_safe)
1202 on_safe->complete(0);
1203 }
1204 }
1205 }
1206
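// Kill the sessions of client instances that appear in the OSDMap blacklist
// (checking TYPE_ANY entries, and TYPE_LEGACY ones pre-nautilus). MDS peers
// are skipped since their death is learned via the MDSMap. Returns the
// number of sessions killed.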
1207 size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
1208 {
1209 bool prenautilus = mds->objecter->with_osdmap(
1210 [&](const OSDMap& o) {
1211 return o.require_osd_release < ceph_release_t::nautilus;
1212 });
1213
1214 std::vector<Session*> victims;
1215 const auto& sessions = mds->sessionmap.get_sessions();
1216 for (const auto& p : sessions) {
1217 if (!p.first.is_client()) {
1218 // Do not apply OSDMap blacklist to MDS daemons, we find out
1219 // about their death via MDSMap.
1220 continue;
1221 }
1222
1223 Session *s = p.second;
1224 auto inst_addr = s->info.inst.addr;
1225 // blacklist entries are always TYPE_ANY for nautilus+
1226 inst_addr.set_type(entity_addr_t::TYPE_ANY);
1227 if (blacklist.count(inst_addr)) {
1228 victims.push_back(s);
1229 continue;
1230 }
1231 if (prenautilus) {
1232 // ...except pre-nautilus, they were TYPE_LEGACY
1233 inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
1234 if (blacklist.count(inst_addr)) {
1235 victims.push_back(s);
1236 }
1237 }
1238 }
1239
1240 for (const auto& s : victims) {
1241 kill_session(s, nullptr);
1242 }
1243
1244 dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
1245
1246 return victims.size();
1247 }
1248
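// Journal a session close/kill: project the new session state, release (or
// schedule a purge of) the session's preallocated inos, submit an ESession
// event whose completion runs _session_logged(), and kill any requests still
// attached to the session.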
1249 void Server::journal_close_session(Session *session, int state, Context *on_safe, bool need_purge_inos)
1250 {
1251 dout(10) << __func__ << " : "
1252 << "("<< need_purge_inos << ")"
1253 << session->info.inst
1254 << "(" << session->info.prealloc_inos.size() << "|" << session->pending_prealloc_inos.size() << ")" << dendl;
1255
1256 uint64_t sseq = mds->sessionmap.set_state(session, state);
1257 version_t pv = mds->sessionmap.mark_projected(session);
1258 version_t piv = 0;
1259
1260 // release alloc and pending-alloc inos for this session
1261 // and wipe out session state, in case the session close aborts for some reason
1262 interval_set<inodeno_t> both;
1263 both.insert(session->pending_prealloc_inos);
1264 if (!need_purge_inos)
1265 both.insert(session->info.prealloc_inos);
1266 if (both.size()) {
1267 mds->inotable->project_release_ids(both);
1268 piv = mds->inotable->get_projected_version();
1269 } else
1270 piv = 0;
1271
1272 if (need_purge_inos && session->info.prealloc_inos.size()) {
1273 dout(10) << "start purging inodes " << session->info.prealloc_inos << dendl;
1274 LogSegment* ls = mdlog->get_current_segment();
1275 LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, session->info.prealloc_inos);
1276 MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv,
1277 session->info.prealloc_inos, ls, on_safe);
1278 mdlog->start_submit_entry(e, c);
1279 } else {
1280 interval_set<inodeno_t> empty;
1281 LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, empty);
1282 MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe);
1283 mdlog->start_submit_entry(e, c);
1284 }
1285 mdlog->flush();
1286
1287 // clean up requests, too
1288 for (auto p = session->requests.begin(); !p.end(); ) {
1289 MDRequestRef mdr(*p);
1290 ++p;
1291 mdcache->request_kill(mdr);
1292 }
1293
1294 finish_flush_session(session, session->get_push_seq());
1295 }
1296
1297 void Server::reconnect_clients(MDSContext *reconnect_done_)
1298 {
1299 reconnect_done = reconnect_done_;
1300
1301 auto now = clock::now();
1302 set<Session*> sessions;
1303 mds->sessionmap.get_client_session_set(sessions);
1304 for (auto session : sessions) {
1305 if (session->is_open()) {
1306 client_reconnect_gather.insert(session->get_client());
1307 session->set_reconnecting(true);
1308 session->last_cap_renew = now;
1309 }
1310 }
1311
1312 if (client_reconnect_gather.empty()) {
1313 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1314 reconnect_gather_finish();
1315 return;
1316 }
1317
1318 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1319
1320 reconnect_start = now;
1321 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1322 mds->sessionmap.dump();
1323 }
1324
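// Process one client's reconnect during the reconnect phase. It is denied if
// the MDS has left the reconnect state, the session is closed, the mds is
// readonly, or required features are missing; otherwise the message's
// snaprealms and caps are re-registered with the cache and the client is
// removed from the reconnect gather set.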
1325 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1326 {
1327 dout(7) << "handle_client_reconnect " << m->get_source()
1328 << (m->has_more() ? " (more)" : "") << dendl;
1329 client_t from = m->get_source().num();
1330 Session *session = mds->get_session(m);
1331 if (!session) {
1332 dout(0) << " ignoring sessionless msg " << *m << dendl;
1333 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1334 reply->metadata["error_string"] = "sessionless";
1335 mds->send_message(reply, m->get_connection());
1336 return;
1337 }
1338
1339 if (!session->is_open()) {
1340 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1341 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1342 mds->send_message(reply, m->get_connection());
1343 return;
1344 }
1345
1346 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1347 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1348 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1349 return;
1350 }
1351
1352 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1353 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1354
1355 bool deny = false;
1356 if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1357 // XXX maybe in the future we can do better than this?
1358 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1359 mds->clog->info() << "denied reconnect attempt (mds is "
1360 << ceph_mds_state_name(mds->get_state())
1361 << ") from " << m->get_source_inst()
1362 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1363 deny = true;
1364 } else {
1365 std::string error_str;
1366 if (!session->is_open()) {
1367 error_str = "session is closed";
1368 } else if (mdcache->is_readonly()) {
1369 error_str = "mds is readonly";
1370 } else {
1371 if (session->info.client_metadata.features.empty())
1372 infer_supported_features(session, session->info.client_metadata);
1373
1374 feature_bitset_t missing_features = required_client_features;
1375 missing_features -= session->info.client_metadata.features;
1376 if (!missing_features.empty()) {
1377 stringstream ss;
1378 ss << "missing required features '" << missing_features << "'";
1379 error_str = ss.str();
1380 }
1381 }
1382
1383 if (!error_str.empty()) {
1384 deny = true;
1385 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1386 mds->clog->info() << "denied reconnect attempt from "
1387 << m->get_source_inst() << " (" << error_str << ")";
1388 }
1389 }
1390
1391 if (deny) {
1392 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1393 mds->send_message_client(r, session);
1394 if (session->is_open())
1395 kill_session(session, nullptr);
1396 return;
1397 }
1398
1399 if (!m->has_more()) {
1400 // notify client of success with an OPEN
1401 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1402 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1403 reply->supported_features = supported_features;
1404 mds->send_message_client(reply, session);
1405 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1406 }
1407
1408 session->last_cap_renew = clock::now();
1409
1410 // snaprealms
1411 for (const auto &r : m->realms) {
1412 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1413 if (in && in->state_test(CInode::STATE_PURGING))
1414 continue;
1415 if (in) {
1416 if (in->snaprealm) {
1417 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1418 } else {
1419 // this can happen if we are non-auth or we rollback snaprealm
1420 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1421 }
1422 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1423 } else {
1424 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1425 << " seq " << r.realm.seq << dendl;
1426 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1427 }
1428 }
1429
1430 // caps
1431 for (const auto &p : m->caps) {
1432 // make sure our last_cap_id is MAX over all issued caps
1433 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1434 mdcache->last_cap_id = p.second.capinfo.cap_id;
1435
1436 CInode *in = mdcache->get_inode(p.first);
1437 if (in && in->state_test(CInode::STATE_PURGING))
1438 continue;
1439 if (in && in->is_auth()) {
1440 // we recovered it, and it's ours. take note.
1441 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1442 << " on " << *in << dendl;
1443 in->reconnect_cap(from, p.second, session);
1444 mdcache->add_reconnected_cap(from, p.first, p.second);
1445 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1446 continue;
1447 }
1448
1449 if (in && !in->is_auth()) {
1450 // not mine.
1451 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1452 // add to cap export list.
1453 mdcache->rejoin_export_caps(p.first, from, p.second,
1454 in->authority().first, true);
1455 } else {
1456 // don't know if the inode is mine
1457 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1458 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1459 }
1460 }
1461
1462 reconnect_last_seen = clock::now();
1463
1464 if (!m->has_more()) {
1465 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1466
1467 // remove from gather set
1468 client_reconnect_gather.erase(from);
1469 session->set_reconnecting(false);
1470 if (client_reconnect_gather.empty())
1471 reconnect_gather_finish();
1472 }
1473 }
1474
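// Old clients do not send a feature bitset, so infer one from the
// "ceph_version"/"kernel_version" metadata and the connection's feature
// bits. (1UL << (supported + 1)) - 1 sets every feature bit up to and
// including 'supported'; e.g. supported = 2 yields 0b111.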
1475 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1476 {
1477 int supported = -1;
1478 auto it = client_metadata.find("ceph_version");
1479 if (it != client_metadata.end()) {
1480 // user space client
1481 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1482 supported = CEPHFS_FEATURE_LUMINOUS;
1483 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1484 supported = CEPHFS_FEATURE_KRAKEN;
1485 } else {
1486 it = client_metadata.find("kernel_version");
1487 if (it != client_metadata.end()) {
1488 // kernel client
1489 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1490 supported = CEPHFS_FEATURE_LUMINOUS;
1491 }
1492 }
1493 if (supported == -1 &&
1494 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1495 supported = CEPHFS_FEATURE_JEWEL;
1496
1497 if (supported >= 0) {
1498 unsigned long value = (1UL << (supported + 1)) - 1;
1499 client_metadata.features = feature_bitset_t(value);
1500 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1501 }
1502 }
1503
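// Recompute required_client_features from the MDSMap's min_compat_client,
// then (once past reconnect) evict any connected, non-blacklisted session
// that lacks a required feature.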
1504 void Server::update_required_client_features()
1505 {
1506 vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;
1507
1508 /* If this blows up on you, you added a release without adding a new release bit to cephfs_features.h */
1509 static_assert(CEPHFS_CURRENT_RELEASE == CEPH_RELEASE_MAX-1);
1510
1511 ceph_release_t min_compat = mds->mdsmap->get_min_compat_client();
1512 if (min_compat >= ceph_release_t::octopus)
1513 bits.push_back(CEPHFS_FEATURE_OCTOPUS);
1514 else if (min_compat >= ceph_release_t::nautilus)
1515 bits.push_back(CEPHFS_FEATURE_NAUTILUS);
1516 else if (min_compat >= ceph_release_t::mimic)
1517 bits.push_back(CEPHFS_FEATURE_MIMIC);
1518 else if (min_compat >= ceph_release_t::luminous)
1519 bits.push_back(CEPHFS_FEATURE_LUMINOUS);
1520 else if (min_compat >= ceph_release_t::kraken)
1521 bits.push_back(CEPHFS_FEATURE_KRAKEN);
1522 else if (min_compat >= ceph_release_t::jewel)
1523 bits.push_back(CEPHFS_FEATURE_JEWEL);
1524
1525 std::sort(bits.begin(), bits.end());
1526 required_client_features = feature_bitset_t(bits);
1527 dout(7) << "required_client_features: " << required_client_features << dendl;
1528
1529 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1530 set<Session*> sessions;
1531 mds->sessionmap.get_client_session_set(sessions);
1532 for (auto session : sessions) {
1533 feature_bitset_t missing_features = required_client_features;
1534 missing_features -= session->info.client_metadata.features;
1535 if (!missing_features.empty()) {
1536 bool blacklisted = mds->objecter->with_osdmap(
1537 [session](const OSDMap &osd_map) -> bool {
1538 return osd_map.is_blacklisted(session->info.inst.addr);
1539 });
1540 if (blacklisted)
1541 continue;
1542
1543 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1544 << missing_features << "'";
1545 std::stringstream ss;
1546 mds->evict_client(session->get_client().v, false,
1547 g_conf()->mds_session_blacklist_on_evict, ss);
1548 }
1549 }
1550 }
1551 }
1552
1553 void Server::reconnect_gather_finish()
1554 {
1555 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1556 ceph_assert(reconnect_done);
1557
1558 if (!mds->snapclient->is_synced()) {
1559 // make sure snaptable cache is populated. snaprealms will be
1560 // extensively used in rejoin stage.
1561 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1562 mds->snapclient->wait_for_sync(reconnect_done);
1563 } else {
1564 reconnect_done->complete(0);
1565 }
1566 reconnect_done = NULL;
1567 }
1568
1569 void Server::reconnect_tick()
1570 {
1571 if (reconnect_evicting) {
1572 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1573 return;
1574 }
1575
1576 if (client_reconnect_gather.empty())
1577 return;
1578
1579 auto now = clock::now();
1580 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1581 if (elapse1 < g_conf()->mds_reconnect_timeout)
1582 return;
1583
1584 vector<Session*> remaining_sessions;
1585 remaining_sessions.reserve(client_reconnect_gather.size());
1586 for (auto c : client_reconnect_gather) {
1587 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1588 ceph_assert(session);
1589 remaining_sessions.push_back(session);
1590 // client re-sends cap flush messages before the reconnect message
1591 if (session->last_seen > reconnect_last_seen)
1592 reconnect_last_seen = session->last_seen;
1593 }
1594
1595 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1596 if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
1597 dout(7) << "reconnect_tick: last seen " << elapse2
1598 << " seconds ago, extending reconnect interval" << dendl;
1599 return;
1600 }
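// Both timers have now expired: mds_reconnect_timeout since reconnect
// began, and at least half that interval since any remaining client was
// last heard from; give up on the stragglers below.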
1601
1602 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1603 << " clients have not reconnected in time" << dendl;
1604
1605 // If we're doing blacklist evictions, use this to wait for them before
1606 // proceeding to reconnect_gather_finish
1607 MDSGatherBuilder gather(g_ceph_context);
1608
1609 for (auto session : remaining_sessions) {
1610 // Keep sessions that have specified a timeout. These sessions prevent
1611 // the MDS from going active until they have all been killed or
1612 // reclaimed.
1613 if (session->info.client_metadata.find("timeout") !=
1614 session->info.client_metadata.end()) {
1615 dout(1) << "reconnect keeps " << session->info.inst
1616 << ", need to be reclaimed" << dendl;
1617 client_reclaim_gather.insert(session->get_client());
1618 continue;
1619 }
1620
1621 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1622
1623 mds->clog->warn() << "evicting unresponsive client " << *session
1624 << ", after waiting " << elapse1
1625 << " seconds during MDS startup";
1626
1627 if (g_conf()->mds_session_blacklist_on_timeout) {
1628 std::stringstream ss;
1629 mds->evict_client(session->get_client().v, false, true, ss,
1630 gather.new_sub());
1631 } else {
1632 kill_session(session, NULL, true);
1633 }
1634
1635 failed_reconnects++;
1636 }
1637 client_reconnect_gather.clear();
1638
1639 if (gather.has_subs()) {
1640 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1641 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1642 [this](int r){reconnect_gather_finish();})));
1643 gather.activate();
1644 reconnect_evicting = true;
1645 } else {
1646 reconnect_gather_finish();
1647 }
1648 }
1649
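/*
 * Replay a client's advisory locks from its reconnect payload. Judging from
 * the decode calls below, the blob is assumed to be laid out as:
 *   int32 num_fcntl_locks, ceph_filelock x num_fcntl_locks,
 *   int32 num_flock_locks, ceph_filelock x num_flock_locks
 * The client id recorded in each replayed lock is overwritten with the
 * reconnecting client.
 */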
1650 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1651 {
1652 if (!locks.length()) return;
1653 int numlocks;
1654 ceph_filelock lock;
1655 auto p = locks.cbegin();
1656 decode(numlocks, p);
1657 for (int i = 0; i < numlocks; ++i) {
1658 decode(lock, p);
1659 lock.client = client;
1660 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1661 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1662 }
1663 decode(numlocks, p);
1664 for (int i = 0; i < numlocks; ++i) {
1665 decode(lock, p);
1666 lock.client = client;
1667 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1668 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1669 }
1670 }
1671
1672 /**
1673 * Call this when the MDCache is oversized, to send requests to the clients
1674 * to trim some caps, and consequently unpin some inodes in the MDCache so
1675 * that it can trim too.
1676 */
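/* A hedged usage sketch (caller context and flag choice illustrative, not a
 * fixed contract):
 *
 *   MDSGatherBuilder gather(g_ceph_context);
 *   auto [throttled, recalled] =
 *       server->recall_client_state(&gather, Server::RecallFlags::STEADY);
 *   if (gather.has_subs())
 *     gather.activate();
 */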
1677 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1678 {
1679 const auto now = clock::now();
1680 const bool steady = !!(flags&RecallFlags::STEADY);
1681 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1682 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1683 const bool trim = !!(flags&RecallFlags::TRIM);
1684
1685 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1686 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1687 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1688 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1689 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1690 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1691
1692 dout(7) << __func__ << ":"
1693 << " min=" << min_caps_per_client
1694 << " max=" << max_caps_per_client
1695 << " total=" << Capability::count()
1696 << " flags=" << flags
1697 << dendl;
1698
1699 /* trim caps of sessions with the most caps first */
1700 std::multimap<uint64_t, Session*> caps_session;
1701 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1702 auto num_caps = s->caps.size();
1703 auto cache_liveness = s->get_session_cache_liveness();
1704 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1705 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1706 }
1707 };
1708 mds->sessionmap.get_client_sessions(std::move(f));
1709
1710 std::pair<bool, uint64_t> result = {false, 0};
1711 auto& [throttled, caps_recalled] = result;
1712 last_recall_state = now;
1713 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1714 if (!session->is_open() ||
1715 !session->get_connection() ||
1716 !session->info.inst.name.is_client())
1717 continue;
1718
1719 dout(10) << __func__ << ":"
1720 << " session " << session->info.inst
1721 << " caps " << num_caps
1722 << ", leases " << session->leases.size()
1723 << dendl;
1724
1725 uint64_t newlim;
1726 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1727 newlim = min_caps_per_client;
1728 } else {
1729 newlim = num_caps-recall_max_caps;
1730 }
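    // Illustrative numbers (all values hypothetical): with num_caps = 60000,
    // recall_max_caps = 5000 and min_caps_per_client = 100, newlim starts at
    // 55000 and the recall computed below is 5000 caps for this pass.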
1731 if (num_caps > newlim) {
1732 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1733 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1734 newlim = num_caps-recall;
1735 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1736 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1737 const uint64_t global_recall_throttle = recall_throttle.get();
1738 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1739 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1740 throttled = true;
1741 continue;
1742 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1743 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1744 throttled = true;
1745 continue;
1746 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1747 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1748 throttled = true;
1749 break;
1750 }
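      // Three throttles gate each recall: the per-session decay counter, a
      // second-order per-session counter, and the global counter. Tripping
      // any of them marks this pass throttled; only the global one aborts
      // the loop.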
1751
1752 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1753 if (steady) {
1754 const auto session_recall = session->get_recall_caps();
1755 const auto session_release = session->get_release_caps();
1756 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1757 /* The session has been unable to keep up with the caps recalled
1758 * (releases are less than half the recalls); additionally, to avoid
1759 * flagging sessions we've only just begun to recall from, the
1760 * session_recall counter (decayed count of caps recently recalled)
1761 * must exceed half the session's cap recall throttle threshold.
1762 */
1763 dout(15) << " 2*session_release < session_recall"
1764 " (2*" << session_release << " < " << session_recall << ") &&"
1765 " 2*session_recall < recall_max_decay_threshold"
1766 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1767 " Skipping because we are unlikely to get more released." << dendl;
1768 continue;
1769 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1770 /* The number of caps recalled is less than the number we *could*
1771 * recall (so there isn't much left to recall?) and is also less
1772 * than half the current recall_caps counter (decayed count of caps
1773 * recently recalled).
1774 */
1775 dout(15) << " 2*recall < session_recall "
1776 " (2*" << recall << " < " << session_recall << ") &&"
1777 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1778 " Skipping because we are unlikely to get more released." << dendl;
1779 continue;
1780 }
1781 }
1782
1783 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1784
1785 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1786 m->head.max_caps = newlim;
1787 mds->send_message_client(m, session);
1788 if (gather) {
1789 flush_session(session, gather);
1790 }
1791 caps_recalled += session->notify_recall_sent(newlim);
1792 recall_throttle.hit(recall);
1793 }
1794 }
1795
1796 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1797
1798 return result;
1799 }
1800
1801 void Server::force_clients_readonly()
1802 {
1803 dout(10) << "force_clients_readonly" << dendl;
1804 set<Session*> sessions;
1805 mds->sessionmap.get_client_session_set(sessions);
1806 for (Session *session : sessions) {
1810 if (!session->info.inst.name.is_client() ||
1811 !(session->is_open() || session->is_stale()))
1812 continue;
1813 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1814 }
1815 }
1816
1817 /*******
1818 * some generic stuff for finishing off requests
1819 */
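// Typical flow below: note the trace targets, attempt an early (unsafe)
// reply, journal the event, then either queue the next replay op, drop
// rdlocks (if the early reply was sent), or flush the log so the safe
// reply is not delayed.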
1820 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1821 {
1822 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1823 ceph_assert(!mdr->has_completed);
1824
1825 // note trace items for eventual reply.
1826 mdr->tracei = in;
1827 if (in)
1828 mdr->pin(in);
1829
1830 mdr->tracedn = dn;
1831 if (dn)
1832 mdr->pin(dn);
1833
1834 early_reply(mdr, in, dn);
1835
1836 mdr->committing = true;
1837 submit_mdlog_entry(le, fin, mdr, __func__);
1838
1839 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1840 if (mds->queue_one_replay()) {
1841 dout(10) << " queued next replay op" << dendl;
1842 } else {
1843 dout(10) << " journaled last replay op" << dendl;
1844 }
1845 } else if (mdr->did_early_reply)
1846 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1847 else
1848 mdlog->flush();
1849 }
1850
1851 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1852 std::string_view event)
1853 {
1854 if (mdr) {
1855 string event_str("submit entry: ");
1856 event_str += event;
1857 mdr->mark_event(event_str);
1858 }
1859 mdlog->submit_entry(le, fin);
1860 }
1861
1862 /*
1863 * send response built from mdr contents and error code; clean up mdr
1864 */
1865 void Server::respond_to_request(MDRequestRef& mdr, int r)
1866 {
1867 if (mdr->client_request) {
1868 if (mdr->is_batch_op() && mdr->is_batch_head) {
1869 int mask = mdr->client_request->head.args.getattr.mask;
1870
1871 std::unique_ptr<BatchOp> bop;
1872 if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) {
1873 dout(20) << __func__ << ": respond other getattr ops. " << *mdr << dendl;
1874 auto it = mdr->in[0]->batch_ops.find(mask);
1875 bop = std::move(it->second);
1876 mdr->in[0]->batch_ops.erase(it);
1877 } else {
1878 dout(20) << __func__ << ": respond other lookup ops. " << *mdr << dendl;
1879 auto it = mdr->dn[0].back()->batch_ops.find(mask);
1880 bop = std::move(it->second);
1881 mdr->dn[0].back()->batch_ops.erase(it);
1882 }
1883
1884 bop->respond(r);
1885 } else {
1886 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1887 }
1888 } else if (mdr->internal_op > -1) {
1889 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1890 if (!mdr->internal_op_finish)
1891 ceph_abort_msg("trying to respond to internal op without finisher");
1892 mdr->internal_op_finish->complete(r);
1893 mdcache->request_finish(mdr);
1894 }
1895 }
1896
1897 // statistics mds req op number and latency
1898 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1899 {
1900 int code = l_mdss_first;
1901 switch(req->get_op()) {
1902 case CEPH_MDS_OP_LOOKUPHASH:
1903 code = l_mdss_req_lookuphash_latency;
1904 break;
1905 case CEPH_MDS_OP_LOOKUPINO:
1906 code = l_mdss_req_lookupino_latency;
1907 break;
1908 case CEPH_MDS_OP_LOOKUPPARENT:
1909 code = l_mdss_req_lookupparent_latency;
1910 break;
1911 case CEPH_MDS_OP_LOOKUPNAME:
1912 code = l_mdss_req_lookupname_latency;
1913 break;
1914 case CEPH_MDS_OP_LOOKUP:
1915 code = l_mdss_req_lookup_latency;
1916 break;
1917 case CEPH_MDS_OP_LOOKUPSNAP:
1918 code = l_mdss_req_lookupsnap_latency;
1919 break;
1920 case CEPH_MDS_OP_GETATTR:
1921 code = l_mdss_req_getattr_latency;
1922 break;
1923 case CEPH_MDS_OP_SETATTR:
1924 code = l_mdss_req_setattr_latency;
1925 break;
1926 case CEPH_MDS_OP_SETLAYOUT:
1927 code = l_mdss_req_setlayout_latency;
1928 break;
1929 case CEPH_MDS_OP_SETDIRLAYOUT:
1930 code = l_mdss_req_setdirlayout_latency;
1931 break;
1932 case CEPH_MDS_OP_SETXATTR:
1933 code = l_mdss_req_setxattr_latency;
1934 break;
1935 case CEPH_MDS_OP_RMXATTR:
1936 code = l_mdss_req_rmxattr_latency;
1937 break;
1938 case CEPH_MDS_OP_READDIR:
1939 code = l_mdss_req_readdir_latency;
1940 break;
1941 case CEPH_MDS_OP_SETFILELOCK:
1942 code = l_mdss_req_setfilelock_latency;
1943 break;
1944 case CEPH_MDS_OP_GETFILELOCK:
1945 code = l_mdss_req_getfilelock_latency;
1946 break;
1947 case CEPH_MDS_OP_CREATE:
1948 code = l_mdss_req_create_latency;
1949 break;
1950 case CEPH_MDS_OP_OPEN:
1951 code = l_mdss_req_open_latency;
1952 break;
1953 case CEPH_MDS_OP_MKNOD:
1954 code = l_mdss_req_mknod_latency;
1955 break;
1956 case CEPH_MDS_OP_LINK:
1957 code = l_mdss_req_link_latency;
1958 break;
1959 case CEPH_MDS_OP_UNLINK:
1960 code = l_mdss_req_unlink_latency;
1961 break;
1962 case CEPH_MDS_OP_RMDIR:
1963 code = l_mdss_req_rmdir_latency;
1964 break;
1965 case CEPH_MDS_OP_RENAME:
1966 code = l_mdss_req_rename_latency;
1967 break;
1968 case CEPH_MDS_OP_MKDIR:
1969 code = l_mdss_req_mkdir_latency;
1970 break;
1971 case CEPH_MDS_OP_SYMLINK:
1972 code = l_mdss_req_symlink_latency;
1973 break;
1974 case CEPH_MDS_OP_LSSNAP:
1975 code = l_mdss_req_lssnap_latency;
1976 break;
1977 case CEPH_MDS_OP_MKSNAP:
1978 code = l_mdss_req_mksnap_latency;
1979 break;
1980 case CEPH_MDS_OP_RMSNAP:
1981 code = l_mdss_req_rmsnap_latency;
1982 break;
1983 case CEPH_MDS_OP_RENAMESNAP:
1984 code = l_mdss_req_renamesnap_latency;
1985 break;
1986 default: ceph_abort();
1987 }
1988 logger->tinc(code, lat);
1989 }
1990
1991 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1992 {
1993 if (!g_conf()->mds_early_reply)
1994 return;
1995
1996 if (mdr->no_early_reply) {
1997 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1998 return;
1999 }
2000
2001 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
2002 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
2003 return;
2004 }
2005
2006 if (mdr->alloc_ino) {
2007 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2008 return;
2009 }
2010
2011 const cref_t<MClientRequest> &req = mdr->client_request;
2012 entity_inst_t client_inst = req->get_source_inst();
2013 if (client_inst.name.is_mds())
2014 return;
2015
2016 if (req->is_replay()) {
2017 dout(10) << " no early reply on replay op" << dendl;
2018 return;
2019 }
2020
2021
2022 auto reply = make_message<MClientReply>(*req, 0);
2023 reply->set_unsafe();
2024
2025 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2026 //
2027 // _rename_finish() does not send dentry link/unlink messages to replicas,
2028 // so do not set xlocks on dentries "done"; the xlocks prevent dentries
2029 // that have projected linkages from getting new replicas.
2030 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2031
2032 dout(10) << "early_reply " << reply->get_result()
2033 << " (" << cpp_strerror(reply->get_result())
2034 << ") " << *req << dendl;
2035
2036 if (tracei || tracedn) {
2037 if (tracei)
2038 mdr->cap_releases.erase(tracei->vino());
2039 if (tracedn)
2040 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2041
2042 set_trace_dist(reply, tracei, tracedn, mdr);
2043 }
2044
2045 reply->set_extra_bl(mdr->reply_extra_bl);
2046 mds->send_message_client(reply, mdr->session);
2047
2048 mdr->did_early_reply = true;
2049
2050 mds->logger->inc(l_mds_reply);
2051 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2052 mds->logger->tinc(l_mds_reply_latency, lat);
2053 if (client_inst.name.is_client()) {
2054 mds->sessionmap.hit_session(mdr->session);
2055 }
2056 perf_gather_op_latency(req, lat);
2057 dout(20) << "lat " << lat << dendl;
2058
2059 mdr->mark_event("early_replied");
2060 }
2061
2062 /*
2063 * send given reply
2064 * include a trace to tracei
2065 * Clean up mdr
2066 */
2067 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2068 {
2069 ceph_assert(mdr.get());
2070 const cref_t<MClientRequest> &req = mdr->client_request;
2071
2072 dout(7) << "reply_client_request " << reply->get_result()
2073 << " (" << cpp_strerror(reply->get_result())
2074 << ") " << *req << dendl;
2075
2076 mdr->mark_event("replying");
2077
2078 Session *session = mdr->session;
2079
2080 // note successful request in session map?
2081 //
2082 // setfilelock requests are special: they only modify state in MDS memory.
2083 // That state is lost when the MDS fails. If a client re-sends a completed
2084 // setfilelock request, it means the client did not receive the corresponding
2085 // setfilelock reply, so the MDS should re-execute the request.
2086 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2087 reply->get_result() == 0 && session) {
2088 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2089 session->add_completed_request(mdr->reqid.tid, created);
2090 if (mdr->ls) {
2091 mdr->ls->touched_sessions.insert(session->info.inst.name);
2092 }
2093 }
2094
2095 // give any preallocated inos to the session
2096 apply_allocated_inos(mdr, session);
2097
2098 // get tracei/tracedn from mdr?
2099 CInode *tracei = mdr->tracei;
2100 CDentry *tracedn = mdr->tracedn;
2101
2102 bool is_replay = mdr->client_request->is_replay();
2103 bool did_early_reply = mdr->did_early_reply;
2104 entity_inst_t client_inst = req->get_source_inst();
2105
2106 if (!did_early_reply && !is_replay) {
2107
2108 mds->logger->inc(l_mds_reply);
2109 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2110 mds->logger->tinc(l_mds_reply_latency, lat);
2111 if (session && client_inst.name.is_client()) {
2112 mds->sessionmap.hit_session(session);
2113 }
2114 perf_gather_op_latency(req, lat);
2115 dout(20) << "lat " << lat << dendl;
2116
2117 if (tracei)
2118 mdr->cap_releases.erase(tracei->vino());
2119 if (tracedn)
2120 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2121 }
2122
2123 // drop non-rdlocks before replying, so that we can issue leases
2124 mdcache->request_drop_non_rdlocks(mdr);
2125
2126 // reply at all?
2127 if (session && !client_inst.name.is_mds()) {
2128 // send reply.
2129 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2130 (tracei || tracedn)) {
2131 if (is_replay) {
2132 if (tracei)
2133 mdcache->try_reconnect_cap(tracei, session);
2134 } else {
2135 // include metadata in reply
2136 set_trace_dist(reply, tracei, tracedn, mdr);
2137 }
2138 }
2139
2140 // We can set the extra bl unconditionally: if it's already been sent in the
2141 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2142 reply->set_extra_bl(mdr->reply_extra_bl);
2143
2144 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2145 mds->send_message_client(reply, session);
2146 }
2147
2148 if (req->is_queued_for_replay() &&
2149 (mdr->has_completed || reply->get_result() < 0)) {
2150 if (reply->get_result() < 0) {
2151 int r = reply->get_result();
2152 derr << "reply_client_request: failed to replay " << *req
2153 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2154 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2155 }
2156 mds->queue_one_replay();
2157 }
2158
2159 // clean up request
2160 mdcache->request_finish(mdr);
2161
2162 // take a closer look at tracei, if it happens to be a remote link
2163 if (tracei &&
2164 tracedn &&
2165 tracedn->get_projected_linkage()->is_remote()) {
2166 mdcache->eval_remote(tracedn);
2167 }
2168 }
2169
2170 /*
2171 * pass inode OR dentry (not both, or we may get confused)
2172 *
2173 * trace is in reverse order (i.e. root inode comes last)
2174 */
2175 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2176 CInode *in, CDentry *dn,
2177 MDRequestRef& mdr)
2178 {
2179 // skip doing this for debugging purposes?
2180 if (g_conf()->mds_inject_traceless_reply_probability &&
2181 mdr->ls && !mdr->o_trunc &&
2182 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2183 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2184 return;
2185 }
2186
2187 // inode, dentry, dir, ..., inode
2188 bufferlist bl;
2189 mds_rank_t whoami = mds->get_nodeid();
2190 Session *session = mdr->session;
2191 snapid_t snapid = mdr->snapid;
2192 utime_t now = ceph_clock_now();
2193
2194 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2195
2196 // realm
2197 if (snapid == CEPH_NOSNAP) {
2198 SnapRealm *realm;
2199 if (in)
2200 realm = in->find_snaprealm();
2201 else
2202 realm = dn->get_dir()->get_inode()->find_snaprealm();
2203 reply->snapbl = realm->get_snap_trace();
2204 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2205 }
2206
2207 // dir + dentry?
2208 if (dn) {
2209 reply->head.is_dentry = 1;
2210 CDir *dir = dn->get_dir();
2211 CInode *diri = dir->get_inode();
2212
2213 diri->encode_inodestat(bl, session, NULL, snapid);
2214 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2215
2216 #ifdef MDS_VERIFY_FRAGSTAT
2217 if (dir->is_complete())
2218 dir->verify_fragstat();
2219 #endif
2220 DirStat ds;
2221 ds.frag = dir->get_frag();
2222 ds.auth = dir->get_dir_auth().first;
2223 if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
2224 dir->get_dist_spec(ds.dist, whoami);
2225
2226 dir->encode_dirstat(bl, session->info, ds);
2227 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2228
2229 encode(dn->get_name(), bl);
2230
2231 int lease_mask = 0;
2232 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2233 if (dnl->is_primary()) {
2234 ceph_assert(dnl->get_inode() == in);
2235 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2236 } else {
2237 if (dnl->is_remote())
2238 ceph_assert(dnl->get_remote_ino() == in->ino());
2239 else
2240 ceph_assert(!in);
2241 }
2242 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
2243 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2244 } else
2245 reply->head.is_dentry = 0;
2246
2247 // inode
2248 if (in) {
2249 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2250 dout(20) << "set_trace_dist added in " << *in << dendl;
2251 reply->head.is_target = 1;
2252 } else
2253 reply->head.is_target = 0;
2254
2255 reply->set_trace(bl);
2256 }
2257
2258 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2259 {
2260 dout(4) << "handle_client_request " << *req << dendl;
2261
2262 if (mds->logger)
2263 mds->logger->inc(l_mds_request);
2264 if (logger)
2265 logger->inc(l_mdss_handle_client_request);
2266
2267 if (!mdcache->is_open()) {
2268 dout(5) << "waiting for root" << dendl;
2269 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2270 return;
2271 }
2272
2273 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2274 // active session?
2275 Session *session = 0;
2276 if (req->get_source().is_client()) {
2277 session = mds->get_session(req);
2278 if (!session) {
2279 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2280 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2281 session->is_closing() ||
2282 session->is_killing()) {
2283 dout(5) << "session closed|closing|killing, dropping" << dendl;
2284 session = NULL;
2285 }
2286 if (!session) {
2287 if (req->is_queued_for_replay())
2288 mds->queue_one_replay();
2289 return;
2290 }
2291 }
2292
2293 // old mdsmap?
2294 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2295 // send it? hrm, this isn't ideal; they may get a lot of copies if
2296 // they have a high request rate.
2297 }
2298
2299 // completed request?
2300 bool has_completed = false;
2301 if (req->is_replay() || req->get_retry_attempt()) {
2302 ceph_assert(session);
2303 inodeno_t created;
2304 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2305 has_completed = true;
2306 if (!session->is_open())
2307 return;
2308 // Don't send a traceless reply if the completed request created a
2309 // new inode; treat the request as a lookup request instead.
2310 if (req->is_replay() ||
2311 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2312 req->get_op() != CEPH_MDS_OP_OPEN &&
2313 req->get_op() != CEPH_MDS_OP_CREATE)) {
2314 dout(5) << "already completed " << req->get_reqid() << dendl;
2315 auto reply = make_message<MClientReply>(*req, 0);
2316 if (created != inodeno_t()) {
2317 bufferlist extra;
2318 encode(created, extra);
2319 reply->set_extra_bl(extra);
2320 }
2321 mds->send_message_client(reply, session);
2322
2323 if (req->is_queued_for_replay())
2324 mds->queue_one_replay();
2325
2326 return;
2327 }
2328 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2329 req->get_op() != CEPH_MDS_OP_CREATE) {
2330 dout(10) << " completed request which created new inode " << created
2331 << ", convert it to lookup request" << dendl;
2332 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2333 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2334 }
2335 }
2336 }
2337
2338 // trim completed_request list
2339 if (req->get_oldest_client_tid() > 0) {
2340 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2341 ceph_assert(session);
2342 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2343 // The session's 'completed_requests' was dirtied; mark it to be
2344 // potentially flushed at segment expiry.
2345 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2346
2347 if (session->get_num_trim_requests_warnings() > 0 &&
2348 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2349 session->reset_num_trim_requests_warnings();
2350 } else {
2351 if (session->get_num_completed_requests() >=
2352 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2353 session->inc_num_trim_requests_warnings();
2354 stringstream ss;
2355 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2356 << req->get_oldest_client_tid() << "), "
2357 << session->get_num_completed_requests()
2358 << " completed requests recorded in session\n";
2359 mds->clog->warn() << ss.str();
2360 dout(20) << __func__ << " " << ss.str() << dendl;
2361 }
2362 }
2363 }
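  // Note the warning threshold doubles with each warning emitted
  // (mds_max_completed_requests << num_trim_requests_warnings), so a client
  // that never advances its oldest_client_tid is logged progressively less
  // often.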
2364
2365 // register + dispatch
2366 MDRequestRef mdr = mdcache->request_start(req);
2367 if (!mdr.get())
2368 return;
2369
2370 if (session) {
2371 mdr->session = session;
2372 session->requests.push_back(&mdr->item_session_request);
2373 }
2374
2375 if (has_completed)
2376 mdr->has_completed = true;
2377
2378 // process embedded cap releases?
2379 // (only if NOT replay!)
2380 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2381 client_t client = req->get_source().num();
2382 for (const auto &r : req->releases) {
2383 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2384 }
2385 req->releases.clear();
2386 }
2387
2388 dispatch_client_request(mdr);
2389 return;
2390 }
2391
2392 void Server::handle_osd_map()
2393 {
2394 /* Note that we check the OSDMAP_FULL flag directly rather than
2395 * using osdmap_full_flag(), because we want to know "is the flag set"
2396 * rather than "does the flag apply to us?" */
2397 mds->objecter->with_osdmap([this](const OSDMap& o) {
2398 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2399 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2400 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2401 << o.get_epoch() << dendl;
2402 });
2403 }
2404
2405 void Server::clear_batch_ops(const MDRequestRef& mdr)
2406 {
2407 int mask = mdr->client_request->head.args.getattr.mask;
2408 if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR && mdr->in[0]) {
2409 mdr->in[0]->batch_ops.erase(mask);
2410 } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP && mdr->dn[0].size()) {
2411 mdr->dn[0].back()->batch_ops.erase(mask);
2412 }
2413 }
2414
2415 void Server::dispatch_client_request(MDRequestRef& mdr)
2416 {
2417 // we shouldn't be waiting on anyone.
2418 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
2419
2420 if (mdr->killed) {
2421 dout(10) << "request " << *mdr << " was killed" << dendl;
2422 // if the mdr is a "batch_op" and it has followers, pick a follower as
2423 // the new "head of the batch ops" and go on processing the new one.
2424 if (mdr->is_batch_op() && mdr->is_batch_head) {
2425 if (!mdr->batch_reqs.empty()) {
2426 MDRequestRef new_batch_head;
2427 for (auto itr = mdr->batch_reqs.cbegin(); itr != mdr->batch_reqs.cend();) {
2428 auto req = *itr;
2429 itr = mdr->batch_reqs.erase(itr);
2430 if (!req->killed) {
2431 new_batch_head = req;
2432 break;
2433 }
2434 }
2435
2436 if (!new_batch_head) {
2437 clear_batch_ops(mdr);
2438 return;
2439 }
2440
2441 new_batch_head->batch_reqs = std::move(mdr->batch_reqs);
2442
2443 mdr = new_batch_head;
2444 mdr->is_batch_head = true;
2445 int mask = mdr->client_request->head.args.getattr.mask;
2446 if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) {
2447 auto& fin = mdr->in[0]->batch_ops[mask];
2448 fin->set_request(new_batch_head);
2449 } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP) {
2450 auto& fin = mdr->dn[0].back()->batch_ops[mask];
2451 fin->set_request(new_batch_head);
2452 }
2453 } else {
2454 clear_batch_ops(mdr);
2455 return;
2456 }
2457 } else {
2458 return;
2459 }
2460 } else if (mdr->aborted) {
2461 mdr->aborted = false;
2462 mdcache->request_kill(mdr);
2463 return;
2464 }
2465
2466 const cref_t<MClientRequest> &req = mdr->client_request;
2467
2468 if (logger) logger->inc(l_mdss_dispatch_client_request);
2469
2470 dout(7) << "dispatch_client_request " << *req << dendl;
2471
2472 if (req->may_write() && mdcache->is_readonly()) {
2473 dout(10) << " read-only FS" << dendl;
2474 respond_to_request(mdr, -EROFS);
2475 return;
2476 }
2477 if (mdr->has_more() && mdr->more()->slave_error) {
2478 dout(10) << " got error from slaves" << dendl;
2479 respond_to_request(mdr, mdr->more()->slave_error);
2480 return;
2481 }
2482
2483 if (is_full) {
2484 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2485 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2487 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2488 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2489 req->get_op() == CEPH_MDS_OP_CREATE ||
2490 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2491 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2492 ((req->get_op() == CEPH_MDS_OP_LINK ||
2493 req->get_op() == CEPH_MDS_OP_RENAME) &&
2494 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
2495 ) {
2496
2497 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2498 respond_to_request(mdr, -ENOSPC);
2499 return;
2500 } else {
2501 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2502 }
2503 }
2504
2505 switch (req->get_op()) {
2506 case CEPH_MDS_OP_LOOKUPHASH:
2507 case CEPH_MDS_OP_LOOKUPINO:
2508 handle_client_lookup_ino(mdr, false, false);
2509 break;
2510 case CEPH_MDS_OP_LOOKUPPARENT:
2511 handle_client_lookup_ino(mdr, true, false);
2512 break;
2513 case CEPH_MDS_OP_LOOKUPNAME:
2514 handle_client_lookup_ino(mdr, false, true);
2515 break;
2516
2517 // inodes ops.
2518 case CEPH_MDS_OP_LOOKUP:
2519 handle_client_getattr(mdr, true);
2520 break;
2521
2522 case CEPH_MDS_OP_LOOKUPSNAP:
2523 // lookupsnap does not reference a CDentry; treat it as a getattr
2524 case CEPH_MDS_OP_GETATTR:
2525 handle_client_getattr(mdr, false);
2526 break;
2527
2528 case CEPH_MDS_OP_SETATTR:
2529 handle_client_setattr(mdr);
2530 break;
2531 case CEPH_MDS_OP_SETLAYOUT:
2532 handle_client_setlayout(mdr);
2533 break;
2534 case CEPH_MDS_OP_SETDIRLAYOUT:
2535 handle_client_setdirlayout(mdr);
2536 break;
2537 case CEPH_MDS_OP_SETXATTR:
2538 handle_client_setxattr(mdr);
2539 break;
2540 case CEPH_MDS_OP_RMXATTR:
2541 handle_client_removexattr(mdr);
2542 break;
2543
2544 case CEPH_MDS_OP_READDIR:
2545 handle_client_readdir(mdr);
2546 break;
2547
2548 case CEPH_MDS_OP_SETFILELOCK:
2549 handle_client_file_setlock(mdr);
2550 break;
2551
2552 case CEPH_MDS_OP_GETFILELOCK:
2553 handle_client_file_readlock(mdr);
2554 break;
2555
2556 // funky.
2557 case CEPH_MDS_OP_CREATE:
2558 if (mdr->has_completed)
2559 handle_client_open(mdr); // already created.. just open
2560 else
2561 handle_client_openc(mdr);
2562 break;
2563
2564 case CEPH_MDS_OP_OPEN:
2565 handle_client_open(mdr);
2566 break;
2567
2568 // namespace.
2569 // no prior locks.
2570 case CEPH_MDS_OP_MKNOD:
2571 handle_client_mknod(mdr);
2572 break;
2573 case CEPH_MDS_OP_LINK:
2574 handle_client_link(mdr);
2575 break;
2576 case CEPH_MDS_OP_UNLINK:
2577 case CEPH_MDS_OP_RMDIR:
2578 handle_client_unlink(mdr);
2579 break;
2580 case CEPH_MDS_OP_RENAME:
2581 handle_client_rename(mdr);
2582 break;
2583 case CEPH_MDS_OP_MKDIR:
2584 handle_client_mkdir(mdr);
2585 break;
2586 case CEPH_MDS_OP_SYMLINK:
2587 handle_client_symlink(mdr);
2588 break;
2589
2590
2591 // snaps
2592 case CEPH_MDS_OP_LSSNAP:
2593 handle_client_lssnap(mdr);
2594 break;
2595 case CEPH_MDS_OP_MKSNAP:
2596 handle_client_mksnap(mdr);
2597 break;
2598 case CEPH_MDS_OP_RMSNAP:
2599 handle_client_rmsnap(mdr);
2600 break;
2601 case CEPH_MDS_OP_RENAMESNAP:
2602 handle_client_renamesnap(mdr);
2603 break;
2604
2605 default:
2606 dout(1) << " unknown client op " << req->get_op() << dendl;
2607 respond_to_request(mdr, -EOPNOTSUPP);
2608 }
2609 }
2610
2611
2612 // ---------------------------------------
2613 // SLAVE REQUESTS
2614
2615 void Server::handle_slave_request(const cref_t<MMDSSlaveRequest> &m)
2616 {
2617 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2618 mds_rank_t from = mds_rank_t(m->get_source().num());
2619
2620 if (logger) logger->inc(l_mdss_handle_slave_request);
2621
2622 // reply?
2623 if (m->is_reply())
2624 return handle_slave_request_reply(m);
2625
2626 // The purpose of rename notify is to enforce causal message ordering, making
2627 // sure bystanders have received all messages from the rename srcdn's auth MDS.
2628 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
2629 auto reply = make_message<MMDSSlaveRequest>(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
2630 mds->send_message(reply, m->get_connection());
2631 return;
2632 }
2633
2634 CDentry *straydn = NULL;
2635 if (m->straybl.length() > 0) {
2636 mdcache->decode_replica_stray(straydn, m->straybl, from);
2637 ceph_assert(straydn);
2638 m->straybl.clear();
2639 }
2640
2641 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2642 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2643 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2644 return;
2645 }
2646
2647 // am i a new slave?
2648 MDRequestRef mdr;
2649 if (mdcache->have_request(m->get_reqid())) {
2650 // existing?
2651 mdr = mdcache->request_get(m->get_reqid());
2652
2653 // is my request newer?
2654 if (mdr->attempt > m->get_attempt()) {
2655 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2656 << ", dropping " << *m << dendl;
2657 return;
2658 }
2659
2660 if (mdr->attempt < m->get_attempt()) {
2661 // mine is old, close it out
2662 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2663 << ", closing out" << dendl;
2664 mdcache->request_finish(mdr);
2665 mdr.reset();
2666 } else if (mdr->slave_to_mds != from) {
2667 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
2668 return;
2669 }
2670
2671 // may get these while mdr->slave_request is non-null
2672 if (m->get_op() == MMDSSlaveRequest::OP_DROPLOCKS) {
2673 mds->locker->drop_locks(mdr.get());
2674 return;
2675 }
2676 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2677 if (m->is_abort()) {
2678 mdr->aborted = true;
2679 if (mdr->slave_request) {
2680 // only abort on-going xlock, wrlock and auth pin
2681 ceph_assert(!mdr->slave_did_prepare());
2682 } else {
2683 mdcache->request_finish(mdr);
2684 }
2685 } else {
2686 if (m->inode_export.length() > 0)
2687 mdr->more()->inode_import = m->inode_export;
2688 // finish off request.
2689 mdcache->request_finish(mdr);
2690 }
2691 return;
2692 }
2693 }
2694 if (!mdr.get()) {
2695 // new?
2696 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2697 dout(10) << "missing slave request for " << m->get_reqid()
2698 << " OP_FINISH, must have lost race with a forward" << dendl;
2699 return;
2700 }
2701 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2702 mdr->set_op_stamp(m->op_stamp);
2703 }
2704 ceph_assert(mdr->slave_request == 0); // only one at a time, please!
2705
2706 if (straydn) {
2707 mdr->pin(straydn);
2708 mdr->straydn = straydn;
2709 }
2710
2711 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2712 mdr->locks.empty()) {
2713 dout(3) << "not active yet, waiting" << dendl;
2714 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2715 return;
2716 }
2717
2718 mdr->reset_slave_request(m);
2719
2720 dispatch_slave_request(mdr);
2721 }
2722
2723 void Server::handle_slave_request_reply(const cref_t<MMDSSlaveRequest> &m)
2724 {
2725 mds_rank_t from = mds_rank_t(m->get_source().num());
2726
2727 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2728 metareqid_t r = m->get_reqid();
2729 if (!mdcache->have_uncommitted_master(r, from)) {
2730 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2731 << from << " reqid " << r << dendl;
2732 return;
2733 }
2734 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2735 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2736 return;
2737 }
2738
2739 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2740 metareqid_t r = m->get_reqid();
2741 mdcache->committed_master_slave(r, from);
2742 return;
2743 }
2744
2745 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2746 if (m->get_attempt() != mdr->attempt) {
2747 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2748 << m->get_attempt() << dendl;
2749 return;
2750 }
2751
2752 switch (m->get_op()) {
2753 case MMDSSlaveRequest::OP_XLOCKACK:
2754 {
2755 // identify lock, master request
2756 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2757 m->get_object_info());
2758 mdr->more()->slaves.insert(from);
2759 lock->decode_locked_state(m->get_lock_data());
2760 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2761 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2762 mdr->finish_locking(lock);
2763 lock->get_xlock(mdr, mdr->get_client());
2764
2765 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2766 mdr->more()->waiting_on_slave.erase(from);
2767 ceph_assert(mdr->more()->waiting_on_slave.empty());
2768 mdcache->dispatch_request(mdr);
2769 }
2770 break;
2771
2772 case MMDSSlaveRequest::OP_WRLOCKACK:
2773 {
2774 // identify lock, master request
2775 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2776 m->get_object_info());
2777 mdr->more()->slaves.insert(from);
2778 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2779 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2780 ceph_assert(it->is_remote_wrlock());
2781 ceph_assert(it->wrlock_target == from);
2782
2783 mdr->finish_locking(lock);
2784
2785 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2786 mdr->more()->waiting_on_slave.erase(from);
2787 ceph_assert(mdr->more()->waiting_on_slave.empty());
2788 mdcache->dispatch_request(mdr);
2789 }
2790 break;
2791
2792 case MMDSSlaveRequest::OP_AUTHPINACK:
2793 handle_slave_auth_pin_ack(mdr, m);
2794 break;
2795
2796 case MMDSSlaveRequest::OP_LINKPREPACK:
2797 handle_slave_link_prep_ack(mdr, m);
2798 break;
2799
2800 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2801 handle_slave_rmdir_prep_ack(mdr, m);
2802 break;
2803
2804 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2805 handle_slave_rename_prep_ack(mdr, m);
2806 break;
2807
2808 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2809 handle_slave_rename_notify_ack(mdr, m);
2810 break;
2811
2812 default:
2813 ceph_abort();
2814 }
2815 }
2816
2817 void Server::dispatch_slave_request(MDRequestRef& mdr)
2818 {
2819 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2820
2821 if (mdr->aborted) {
2822 dout(7) << " abort flag set, finishing" << dendl;
2823 mdcache->request_finish(mdr);
2824 return;
2825 }
2826
2827 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2828
2829 int op = mdr->slave_request->get_op();
2830 switch (op) {
2831 case MMDSSlaveRequest::OP_XLOCK:
2832 case MMDSSlaveRequest::OP_WRLOCK:
2833 {
2834 // identify object
2835 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2836 mdr->slave_request->get_object_info());
2837
2838 if (!lock) {
2839 dout(10) << "don't have object, dropping" << dendl;
2840 ceph_abort(); // this should not happen if we auth-pinned properly
2841 }
2842 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2843 dout(10) << "not auth for remote xlock attempt, dropping on "
2844 << *lock << " on " << *lock->get_parent() << dendl;
2845 } else {
2846 // use acquire_locks so that we get auth_pinning.
2847 MutationImpl::LockOpVec lov;
2848 for (const auto& p : mdr->locks) {
2849 if (p.is_xlock())
2850 lov.add_xlock(p.lock);
2851 else if (p.is_wrlock())
2852 lov.add_wrlock(p.lock);
2853 }
2854
2855 int replycode = 0;
2856 switch (op) {
2857 case MMDSSlaveRequest::OP_XLOCK:
2858 lov.add_xlock(lock);
2859 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2860 break;
2861 case MMDSSlaveRequest::OP_WRLOCK:
2862 lov.add_wrlock(lock);
2863 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2864 break;
2865 }
2866
2867 if (!mds->locker->acquire_locks(mdr, lov))
2868 return;
2869
2870 // ack
2871 auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, replycode);
2872 r->set_lock_type(lock->get_type());
2873 lock->get_parent()->set_object_info(r->get_object_info());
2874 if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
2875 lock->encode_locked_state(r->get_lock_data());
2876 mds->send_message(r, mdr->slave_request->get_connection());
2877 }
2878
2879 // done.
2880 mdr->reset_slave_request();
2881 }
2882 break;
2883
2884 case MMDSSlaveRequest::OP_UNXLOCK:
2885 case MMDSSlaveRequest::OP_UNWRLOCK:
2886 {
2887 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2888 mdr->slave_request->get_object_info());
2889 ceph_assert(lock);
2890 auto it = mdr->locks.find(lock);
2891 ceph_assert(it != mdr->locks.end());
2892 bool need_issue = false;
2893 switch (op) {
2894 case MMDSSlaveRequest::OP_UNXLOCK:
2895 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2896 break;
2897 case MMDSSlaveRequest::OP_UNWRLOCK:
2898 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2899 break;
2900 }
2901 if (need_issue)
2902 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2903
2904 // done. no ack necessary.
2905 mdr->reset_slave_request();
2906 }
2907 break;
2908
2909 case MMDSSlaveRequest::OP_AUTHPIN:
2910 handle_slave_auth_pin(mdr);
2911 break;
2912
2913 case MMDSSlaveRequest::OP_LINKPREP:
2914 case MMDSSlaveRequest::OP_UNLINKPREP:
2915 handle_slave_link_prep(mdr);
2916 break;
2917
2918 case MMDSSlaveRequest::OP_RMDIRPREP:
2919 handle_slave_rmdir_prep(mdr);
2920 break;
2921
2922 case MMDSSlaveRequest::OP_RENAMEPREP:
2923 handle_slave_rename_prep(mdr);
2924 break;
2925
2926 default:
2927 ceph_abort();
2928 }
2929 }
2930
2931 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2932 {
2933 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2934
2935 // build list of objects
2936 list<MDSCacheObject*> objects;
2937 CInode *auth_pin_freeze = NULL;
2938 bool nonblocking = mdr->slave_request->is_nonblocking();
2939 bool fail = false, wouldblock = false, readonly = false;
2940 ref_t<MMDSSlaveRequest> reply;
2941
2942 if (mdcache->is_readonly()) {
2943 dout(10) << " read-only FS" << dendl;
2944 readonly = true;
2945 fail = true;
2946 }
2947
2948 if (!fail) {
2949 for (const auto &oi : mdr->slave_request->get_authpins()) {
2950 MDSCacheObject *object = mdcache->get_object(oi);
2951 if (!object) {
2952 dout(10) << " don't have " << oi << dendl;
2953 fail = true;
2954 break;
2955 }
2956
2957 objects.push_back(object);
2958 if (oi == mdr->slave_request->get_authpin_freeze())
2959 auth_pin_freeze = static_cast<CInode*>(object);
2960 }
2961 }
2962
2963 // can we auth pin them?
2964 if (!fail) {
2965 for (const auto& obj : objects) {
2966 if (!obj->is_auth()) {
2967 dout(10) << " not auth for " << *obj << dendl;
2968 fail = true;
2969 break;
2970 }
2971 if (mdr->is_auth_pinned(obj))
2972 continue;
2973 if (!mdr->can_auth_pin(obj)) {
2974 if (nonblocking) {
2975 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
2976 fail = true;
2977 wouldblock = true;
2978 break;
2979 }
2980 // wait
2981 dout(10) << " waiting for authpinnable on " << *obj << dendl;
2982 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2983 mdr->drop_local_auth_pins();
2984
2985 mds->locker->notify_freeze_waiter(obj);
2986 goto blocked;
2987 }
2988 }
2989 }
2990
2991 if (!fail) {
2992 /* if we froze an auth pin on the wrong inode, unfreeze it */
2993 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2994 mdr->more()->rename_inode != auth_pin_freeze)
2995 mdr->unfreeze_auth_pin(true);
2996
2997 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2998 * on the source inode to complete. This happens after all locks for the rename
2999 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3000 * parent objects first. So there is an ABBA deadlock if someone auth pins the
3001 * source inode after locks are acquired and before handle_slave_rename_prep()
3002 * is called. The solution is to freeze the inode and prevent other MDRequests
3003 * from getting new auth pins.
3004 */
3005 if (auth_pin_freeze) {
3006 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3007 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3008 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3009 mds->mdlog->flush();
3010 goto blocked;
3011 }
3012 }
3013 }
3014
3015 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
3016
3017 if (fail) {
3018 mdr->drop_local_auth_pins(); // just in case
3019 if (readonly)
3020 reply->mark_error_rofs();
3021 if (wouldblock)
3022 reply->mark_error_wouldblock();
3023 } else {
3024 // auth pin!
3025 for (const auto& obj : objects) {
3026 dout(10) << "auth_pinning " << *obj << dendl;
3027 mdr->auth_pin(obj);
3028 }
3029 // return list of my auth_pins (if any)
3030 for (const auto &p : mdr->object_states) {
3031 if (!p.second.auth_pinned)
3032 continue;
3033 MDSCacheObjectInfo info;
3034 p.first->set_object_info(info);
3035 reply->get_authpins().push_back(info);
3036 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3037 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3038 }
3039 }
3040
3041 mds->send_message_mds(reply, mdr->slave_to_mds);
3042
3043 // clean up this request
3044 mdr->reset_slave_request();
3045 return;
3046
3047 blocked:
3048 if (mdr->slave_request->should_notify_blocking()) {
3049 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
3050 reply->mark_req_blocked();
3051 mds->send_message_mds(reply, mdr->slave_to_mds);
3052 mdr->slave_request->clear_notify_blocking();
3053 }
3054 return;
3055 }
3056
3057 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
3058 {
3059 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3060 mds_rank_t from = mds_rank_t(ack->get_source().num());
3061
3062 if (ack->is_req_blocked()) {
3063 mdr->disable_lock_cache();
3064 // slave auth pin is blocked, drop locks to avoid deadlock
3065 mds->locker->drop_locks(mdr.get(), nullptr);
3066 return;
3067 }
3068
3069 // added auth pins?
3070 set<MDSCacheObject*> pinned;
3071 for (const auto &oi : ack->get_authpins()) {
3072 MDSCacheObject *object = mdcache->get_object(oi);
3073 ceph_assert(object); // we pinned it
3074 dout(10) << " remote has pinned " << *object << dendl;
3075 mdr->set_remote_auth_pinned(object, from);
3076 if (oi == ack->get_authpin_freeze())
3077 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3078 pinned.insert(object);
3079 }
3080
3081 // removed frozen auth pin ?
3082 if (mdr->more()->is_remote_frozen_authpin &&
3083 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3084 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3085 ceph_assert(stat_p);
3086 if (stat_p->remote_auth_pinned == from) {
3087 mdr->more()->is_remote_frozen_authpin = false;
3088 }
3089 }
3090
3091 // removed auth pins?
3092 for (auto& p : mdr->object_states) {
3093 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3094 continue;
3095 MDSCacheObject* object = p.first;
3096 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3097 dout(10) << " remote has unpinned " << *object << dendl;
3098 mdr->_clear_remote_auth_pinned(p.second);
3099 }
3100 }
3101
3102 // note slave
3103 mdr->more()->slaves.insert(from);
3104
3105 // clear from waiting list
3106 auto ret = mdr->more()->waiting_on_slave.erase(from);
3107 ceph_assert(ret);
3108
3109 if (ack->is_error_rofs()) {
3110 mdr->more()->slave_error = -EROFS;
3111 } else if (ack->is_error_wouldblock()) {
3112 mdr->more()->slave_error = -EWOULDBLOCK;
3113 }
3114
3115 // go again?
3116 if (mdr->more()->waiting_on_slave.empty())
3117 mdcache->dispatch_request(mdr);
3118 else
3119 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
3120 }
3121
3122
3123 // ---------------------------------------
3124 // HELPERS
3125
3126
3127 /**
3128 * check whether we are permitted to complete a request
3129 *
3130 * Check whether we have permission to perform the operation specified
3131 * by mask on the given inode, based on the capability in the mdr's
3132 * session.
3133 */
3134 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3135 {
3136 if (mdr->session) {
3137 int r = mdr->session->check_access(
3138 in, mask,
3139 mdr->client_request->get_caller_uid(),
3140 mdr->client_request->get_caller_gid(),
3141 &mdr->client_request->get_caller_gid_list(),
3142 mdr->client_request->head.args.setattr.uid,
3143 mdr->client_request->head.args.setattr.gid);
3144 if (r < 0) {
3145 respond_to_request(mdr, r);
3146 return false;
3147 }
3148 }
3149 return true;
3150 }
3151
3152 /**
3153 * check whether a fragment has reached its maximum size;
3154 * if so, respond to the request with -ENOSPC and return false.
3155 */
3156 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
3157 {
3158 const auto size = in->get_frag_size();
3159 if (size >= g_conf()->mds_bal_fragment_size_max) {
3160 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
3161 respond_to_request(mdr, -ENOSPC);
3162 return false;
3163 }
3164
3165 return true;
3166 }
3167
3168 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3169 {
3170 CDentry *straydn = mdr->straydn;
3171 if (straydn) {
3172 string straydname;
3173 in->name_stray_dentry(straydname);
3174 ceph_assert(straydn->get_name() == straydname);
3175 return straydn;
3176 }
3177
3178 CDir *straydir = mdcache->get_stray_dir(in);
3179
3180 if (!mdr->client_request->is_replay() &&
3181 !check_fragment_space(mdr, straydir))
3182 return NULL;
3183
3184 straydn = mdcache->get_or_create_stray_dentry(in);
3185 mdr->straydn = straydn;
3186 mdr->pin(straydn);
3187 return straydn;
3188 }
3189
3190 /** prepare_new_inode
3191 *
3192 * create a new inode. set c/m/atime. hit dir pop.
3193 */
3194 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3195 file_layout_t *layout)
3196 {
3197 CInode *in = new CInode(mdcache);
3198
3199 // Server::prepare_force_open_sessions() can re-open a session in the closing
3200 // state. In that corner case the session's prealloc_inos are being freed.
3201 // To simplify the code, we disallow using/refilling the session's prealloc_inos
3202 // unless the session is fully open.
3203 bool allow_prealloc_inos = mdr->session->is_open();
3204
3205 // assign ino
3206 if (allow_prealloc_inos && (mdr->used_prealloc_ino = in->inode.ino = mdr->session->take_ino(useino))) {
3207 mds->sessionmap.mark_projected(mdr->session);
3208 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3209 << " (" << mdr->session->info.prealloc_inos
3210 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
3211 << dendl;
3212 } else {
3213 mdr->alloc_ino =
3214 in->inode.ino = mds->inotable->project_alloc_id(useino);
3215 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3216 }
3217
3218 if (useino && useino != in->inode.ino) {
3219 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
3220 mds->clog->error() << mdr->client_request->get_source()
3221 << " specified ino " << useino
3222 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
3223 //ceph_abort(); // just for now.
3224 }
3225
3226 if (allow_prealloc_inos &&
3227 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3228 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3229 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3230 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3231 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3232 mds->sessionmap.mark_projected(mdr->session);
3233 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3234 }
3235
3236 in->inode.version = 1;
3237 in->inode.xattr_version = 1;
3238 in->inode.nlink = 1; // FIXME
3239
3240 in->inode.mode = mode;
3241
3242 // FIPS zeroization audit 20191117: this memset is not security related.
3243 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
3244 if (in->inode.is_dir()) {
3245 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3246 } else if (layout) {
3247 in->inode.layout = *layout;
3248 } else {
3249 in->inode.layout = mdcache->default_file_layout;
3250 }
3251
3252 in->inode.truncate_size = -1ull; // not truncated, yet!
3253 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3254
3255 CInode *diri = dir->get_inode();
3256
3257 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
3258
3259 if (diri->inode.mode & S_ISGID) {
3260 dout(10) << " dir is setgid" << dendl;
3261 in->inode.gid = diri->inode.gid;
3262 if (S_ISDIR(mode)) {
3263 dout(10) << " new dir also setgid" << dendl;
3264 in->inode.mode |= S_ISGID;
3265 }
3266 } else
3267 in->inode.gid = mdr->client_request->get_caller_gid();
3268
3269 in->inode.uid = mdr->client_request->get_caller_uid();
3270
3271 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
3272 mdr->get_op_stamp();
3273
3274 in->inode.change_attr = 0;
3275
3276 const cref_t<MClientRequest> &req = mdr->client_request;
3277 if (req->get_data().length()) {
3278 auto p = req->get_data().cbegin();
3279
3280 // xattrs on new inode?
3281 CInode::mempool_xattr_map xattrs;
3282 decode_noshare(xattrs, p);
3283 for (const auto &p : xattrs) {
3284 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
3285 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
3286 if (!em.second)
3287 em.first->second = p.second;
3288 }
3289 }
3290
3291 if (!mds->mdsmap->get_inline_data_enabled() ||
3292 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3293 in->inode.inline_data.version = CEPH_INLINE_NONE;
3294
3295 mdcache->add_inode(in); // add
3296 dout(10) << "prepare_new_inode " << *in << dendl;
3297 return in;
3298 }
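// Ino assignment above, in sketch form (illustration only):
//
//   if (session is open && it still has preallocated inos)
//     ino = session->take_ino(useino);          // -> mdr->used_prealloc_ino
//   else
//     ino = inotable->project_alloc_id(useino); // -> mdr->alloc_ino
//
// Either way the allocation is only *projected* here; it is journaled by
// journal_allocated_inos() and made durable by apply_allocated_inos()
// below once the log entry commits.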
3299
3300 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3301 {
3302 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3303 << " inotablev " << mds->inotable->get_projected_version()
3304 << dendl;
3305 blob->set_ino_alloc(mdr->alloc_ino,
3306 mdr->used_prealloc_ino,
3307 mdr->prealloc_inos,
3308 mdr->client_request->get_source(),
3309 mds->sessionmap.get_projected(),
3310 mds->inotable->get_projected_version());
3311 }
3312
3313 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3314 {
3315 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3316 << " / " << mdr->prealloc_inos
3317 << " / " << mdr->used_prealloc_ino << dendl;
3318
3319 if (mdr->alloc_ino) {
3320 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3321 }
3322 if (mdr->prealloc_inos.size()) {
3323 ceph_assert(session);
3324 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3325 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3326 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3327 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3328 }
3329 if (mdr->used_prealloc_ino) {
3330 ceph_assert(session);
3331 session->info.used_inos.erase(mdr->used_prealloc_ino);
3332 mds->sessionmap.mark_dirty(session);
3333 }
3334 }
3335
3336 class C_MDS_TryFindInode : public ServerContext {
3337 MDRequestRef mdr;
3338 public:
3339 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3340 void finish(int r) override {
3341 if (r == -ESTALE) // :( find_ino_peers failed
3342 server->respond_to_request(mdr, r);
3343 else
3344 server->dispatch_client_request(mdr);
3345 }
3346 };
3347
3348 class CF_MDS_MDRContextFactory : public MDSContextFactory {
3349 public:
3350 CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
3351 mdcache(cache), mdr(mdr), drop_locks(dl) {}
3352 MDSContext *build() {
3353 if (drop_locks) {
3354 mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
3355 mdr->drop_local_auth_pins();
3356 }
3357 return new C_MDS_RetryRequest(mdcache, mdr);
3358 }
3359 private:
3360 MDCache *mdcache;
3361 MDRequestRef mdr;
3362 bool drop_locks;
3363 };
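// Usage sketch for the factory above: path_traverse() calls build()
// whenever it has to wait, so with drop_locks=true the parked request
// first releases its locks and local auth pins, e.g.
//
//   CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
//   dir->add_waiter(CDir::WAIT_UNFREEZE, cf.build()); // safe to park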
3364
3365 /* If this returns null, the request has been handled
3366 * as appropriate: forwarded on, or the client's been replied to */
3367 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3368 bool want_auth,
3369 bool no_want_auth)
3370 {
3371 const filepath& refpath = mdr->get_filepath();
3372 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3373
3374 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3375 return mdr->in[0];
3376
3377 // traverse
3378 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3379 int flags = 0;
3380 if (refpath.is_last_snap()) {
3381 if (!no_want_auth)
3382 want_auth = true;
3383 } else {
3384 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3385 }
3386 if (want_auth)
3387 flags |= MDS_TRAVERSE_WANT_AUTH;
3388 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3389 if (r > 0)
3390 return nullptr; // delayed
3391 if (r < 0) { // error
3392 if (r == -ENOENT && !mdr->dn[0].empty()) {
3393 if (mdr->client_request &&
3394 mdr->client_request->get_dentry_wanted())
3395 mdr->tracedn = mdr->dn[0].back();
3396 respond_to_request(mdr, r);
3397 } else if (r == -ESTALE) {
3398 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3399 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3400 mdcache->find_ino_peers(refpath.get_ino(), c);
3401 } else {
3402 dout(10) << "FAIL on error " << r << dendl;
3403 respond_to_request(mdr, r);
3404 }
3405 return nullptr;
3406 }
3407 CInode *ref = mdr->in[0];
3408 dout(10) << "ref is " << *ref << dendl;
3409
3410 if (want_auth) {
3411 // auth_pin?
3412 // do NOT proceed if freezing, as cap release may defer in that case, and
3413 // we could deadlock when we try to lock @ref.
3414 // if we're already auth_pinned, continue; the release has already been processed.
3415 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3416 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3417 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3418 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3419 if (mdr->is_any_remote_auth_pin())
3420 mds->locker->notify_freeze_waiter(ref);
3421 return nullptr;
3422 }
3423 mdr->auth_pin(ref);
3424 }
3425
3426 // set and pin ref
3427 mdr->pin(ref);
3428 return ref;
3429 }
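// Caller contract, in sketch form:
//
//   CInode *cur = rdlock_path_pin_ref(mdr, true /*want_auth*/);
//   if (!cur)
//     return;  // forwarded, parked for retry, or already replied to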
3430
3431
3432 /** rdlock_path_xlock_dentry
3433 * traverse path to the directory that could/would contain dentry.
3434 * make sure i am auth for that dentry, forward as necessary.
3435 * create null dentry in place (or use existing if okexist).
3436 * get rdlocks on traversed dentries, xlock on new dentry.
3437 */
3438 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3439 bool create, bool okexist, bool want_layout)
3440 {
3441 const filepath& refpath = mdr->get_filepath();
3442 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3443
3444 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3445 return mdr->dn[0].back();
3446
3447 // figure parent dir vs dname
3448 if (refpath.depth() == 0) {
3449 dout(7) << "invalid path (zero length)" << dendl;
3450 respond_to_request(mdr, -EINVAL);
3451 return nullptr;
3452 }
3453
3454 if (refpath.is_last_snap()) {
3455 respond_to_request(mdr, -EROFS);
3456 return nullptr;
3457 }
3458
3459 if (refpath.is_last_dot_or_dotdot()) {
3460 dout(7) << "invalid path (last dot or dotdot)" << dendl;
3461 if (create)
3462 respond_to_request(mdr, -EEXIST);
3463 else
3464 respond_to_request(mdr, -ENOTEMPTY);
3465 return nullptr;
3466 }
3467
3468 // traverse to parent dir
3469 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3470 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3471 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3472 MDS_TRAVERSE_WANT_AUTH;
3473 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3474 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3475 if (create)
3476 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3477 if (want_layout)
3478 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3479 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3480 if (r > 0)
3481 return nullptr; // delayed
3482 if (r < 0) {
3483 if (r == -ESTALE) {
3484 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3485 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3486 return nullptr;
3487 }
3488 respond_to_request(mdr, r);
3489 return nullptr;
3490 }
3491
3492 CDentry *dn = mdr->dn[0].back();
3493 CDir *dir = dn->get_dir();
3494 CInode *diri = dir->get_inode();
3495
3496 if (!mdr->reqid.name.is_mds()) {
3497 if (diri->is_system() && !diri->is_root()) {
3498 respond_to_request(mdr, -EROFS);
3499 return nullptr;
3500 }
3501 }
3502
3503 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3504 respond_to_request(mdr, -ENOENT);
3505 return nullptr;
3506 }
3507
3508 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3509 if (dnl->is_null()) {
3510 if (!create && okexist) {
3511 respond_to_request(mdr, -ENOENT);
3512 return nullptr;
3513 }
3514
3515 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3516 dn->first = std::max(dn->first, next_snap);
3517 } else {
3518 if (!okexist) {
3519 respond_to_request(mdr, -EEXIST);
3520 return nullptr;
3521 }
3522 mdr->in[0] = dnl->get_inode();
3523 }
3524
3525 return dn;
3526 }
3527
3528 /** rdlock_two_paths_xlock_destdn
3529 * traverse two paths and lock the two paths in proper order.
3530 * The order of taking locks is:
3531 * 1. Lock directory inodes or dentries according to which trees they
3532 * are under. Lock objects under fs root before objects under mdsdir.
3533 * 2. Lock directory inodes or dentries according to their depth, in
3534 * ascending order.
3535 * 3. Lock directory inodes or dentries according to inode numbers or
3536 * dentries' parent inode numbers, in ascending order.
3537 * 4. Lock dentries in the same directory in order of their keys.
3538 * 5. Lock non-directory inodes according to inode numbers, in ascending
3539 * order.
3540 */
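// Worked example: both refpaths must be depth 1 (checked below), so the two
// parent dirs are ordered by tree/depth via mdr->compare_paths() (rules 1-2)
// and, on a tie, by inode number (rule 3); that is exactly what the
// lock_destdir_first computation in the body implements.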
3541 std::pair<CDentry*, CDentry*>
3542 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3543 {
3544
3545 const filepath& refpath = mdr->get_filepath();
3546 const filepath& refpath2 = mdr->get_filepath2();
3547
3548 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3549
3550 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3551 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3552
3553 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3554 respond_to_request(mdr, -EINVAL);
3555 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3556 }
3557
3558 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3559 respond_to_request(mdr, -EROFS);
3560 return std::make_pair(nullptr, nullptr);
3561 }
3562
3563 // traverse to parent dir
3564 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3565 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3566 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3567 if (r != 0) {
3568 if (r == -ESTALE) {
3569 dout(10) << "ESTALE on path, attempting recovery" << dendl;
3570 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3571 } else if (r < 0) {
3572 respond_to_request(mdr, r);
3573 }
3574 return std::make_pair(nullptr, nullptr);
3575 }
3576
3577 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3578 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3579 if (r != 0) {
3580 if (r == -ESTALE) {
3581 dout(10) << "ESTALE on path2, attempting recovery" << dendl;
3582 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3583 } else if (r < 0) {
3584 respond_to_request(mdr, r);
3585 }
3586 return std::make_pair(nullptr, nullptr);
3587 }
3588
3589 CDentry *srcdn = mdr->dn[1].back();
3590 CDir *srcdir = srcdn->get_dir();
3591 CDentry *destdn = mdr->dn[0].back();
3592 CDir *destdir = destdn->get_dir();
3593
3594 if (!mdr->reqid.name.is_mds()) {
3595 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3596 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3597 respond_to_request(mdr, -EROFS);
3598 return std::make_pair(nullptr, nullptr);
3599 }
3600 }
3601
3602 if (!destdir->get_inode()->is_base() &&
3603 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3604 respond_to_request(mdr, -ENOENT);
3605 return std::make_pair(nullptr, nullptr);
3606 }
3607
3608 MutationImpl::LockOpVec lov;
3609 if (srcdir->get_inode() == destdir->get_inode()) {
3610 lov.add_wrlock(&destdir->inode->filelock);
3611 lov.add_wrlock(&destdir->inode->nestlock);
3612 if (xlock_srcdn && srcdir != destdir) {
3613 mds_rank_t srcdir_auth = srcdir->authority().first;
3614 if (srcdir_auth != mds->get_nodeid()) {
3615 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3616 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3617 }
3618 }
3619
3620 if (srcdn->get_name() > destdn->get_name())
3621 lov.add_xlock(&destdn->lock);
3622
3623 if (xlock_srcdn)
3624 lov.add_xlock(&srcdn->lock);
3625 else
3626 lov.add_rdlock(&srcdn->lock);
3627
3628 if (srcdn->get_name() < destdn->get_name())
3629 lov.add_xlock(&destdn->lock);
3630 } else {
3631 int cmp = mdr->compare_paths();
3632 bool lock_destdir_first =
3633 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3634
3635 if (lock_destdir_first) {
3636 lov.add_wrlock(&destdir->inode->filelock);
3637 lov.add_wrlock(&destdir->inode->nestlock);
3638 lov.add_xlock(&destdn->lock);
3639 }
3640
3641 if (xlock_srcdn) {
3642 mds_rank_t srcdir_auth = srcdir->authority().first;
3643 if (srcdir_auth == mds->get_nodeid()) {
3644 lov.add_wrlock(&srcdir->inode->filelock);
3645 lov.add_wrlock(&srcdir->inode->nestlock);
3646 } else {
3647 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3648 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3649 }
3650 lov.add_xlock(&srcdn->lock);
3651 } else {
3652 lov.add_rdlock(&srcdn->lock);
3653 }
3654
3655 if (!lock_destdir_first) {
3656 lov.add_wrlock(&destdir->inode->filelock);
3657 lov.add_wrlock(&destdir->inode->nestlock);
3658 lov.add_xlock(&destdn->lock);
3659 }
3660 }
3661
3662 CInode *auth_pin_freeze = nullptr;
3663 // XXX any better way to do this?
3664 if (xlock_srcdn && !srcdn->is_auth()) {
3665 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3666 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3667 }
3668 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3669 return std::make_pair(nullptr, nullptr);
3670
3671 if (srcdn->get_projected_linkage()->is_null()) {
3672 respond_to_request(mdr, -ENOENT);
3673 return std::make_pair(nullptr, nullptr);
3674 }
3675
3676 if (destdn->get_projected_linkage()->is_null()) {
3677 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3678 destdn->first = std::max(destdn->first, next_snap);
3679 }
3680
3681 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3682
3683 return std::make_pair(destdn, srcdn);
3684 }
3685
3686 /**
3687 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3688 *
3689 * @param diri base inode
3690 * @param fg the exact frag we want
3691 * @param mdr request
3692 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3693 */
3694 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3695 {
3696 CDir *dir = diri->get_dirfrag(fg);
3697
3698 if (dir) {
3699 // am i auth for the dirfrag?
3700 if (!dir->is_auth()) {
3701 mds_rank_t auth = dir->authority().first;
3702 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3703 << ", fw to mds." << auth << dendl;
3704 mdcache->request_forward(mdr, auth);
3705 return nullptr;
3706 }
3707 } else {
3708 // not open and inode not mine?
3709 if (!diri->is_auth()) {
3710 mds_rank_t inauth = diri->authority().first;
3711 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3712 mdcache->request_forward(mdr, inauth);
3713 return nullptr;
3714 }
3715
3716 // not open and inode frozen?
3717 if (diri->is_frozen()) {
3718 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3719 ceph_assert(diri->get_parent_dir());
3720 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3721 return nullptr;
3722 }
3723
3724 // invent?
3725 dir = diri->get_or_open_dirfrag(mdcache, fg);
3726 }
3727
3728 return dir;
3729 }
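// Typical call pattern (see handle_client_readdir below): resolve the frag
// from the dirfragtree under a rdlock, then
//
//   CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
//   if (!dir)
//     return;  // forwarded to the auth MDS or parked; mdr is handled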
3730
3731
3732 // ===============================================================================
3733 // STAT
3734
3735 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3736 {
3737 const cref_t<MClientRequest> &req = mdr->client_request;
3738
3739 if (req->get_filepath().depth() == 0 && is_lookup) {
3740 // refpath can't be empty for lookup but it can for
3741 // getattr (we do getattr with empty refpath for mount of '/')
3742 respond_to_request(mdr, -EINVAL);
3743 return;
3744 }
3745
3746 bool want_auth = false;
3747 int mask = req->head.args.getattr.mask;
3748 if (mask & CEPH_STAT_RSTAT)
3749 want_auth = true; // rstats are only authoritative on the auth MDS
3750
3751 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3752 if (!ref)
3753 return;
3754
3755 mdr->getattr_caps = mask;
3756
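// Batch identical stats: the first getattr/lookup with a given mask becomes
// the batch head below and does the real work; later requests with the same
// mask attach to it and are answered from the head's reply (see
// Batch_Getattr_Lookup at the top of this file).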
3757 if (mdr->snapid == CEPH_NOSNAP && !mdr->is_batch_head && mdr->is_batch_op()) {
3758 if (!is_lookup) {
3759 auto em = ref->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3760 if (em.second) {
3761 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr, mdcache);
3762 } else {
3763 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3764 em.first->second->add_request(mdr);
3765 return;
3766 }
3767 } else {
3768 CDentry* dn = mdr->dn[0].back();
3769 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3770 if (em.second) {
3771 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr, mdcache);
3772 mdr->pin(dn);
3773 } else {
3774 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3775 em.first->second->add_request(mdr);
3776 return;
3777 }
3778 }
3779 mdr->is_batch_head = true;
3780 }
3781
3782 /*
3783 * if client currently holds the EXCL cap on a field, do not rdlock
3784 * it; client's stat() will result in valid info if _either_ EXCL
3785 * cap is held or MDS rdlocks and reads the value here.
3786 *
3787 * handling this case here is easier than weakening rdlock
3788 * semantics... that would cause problems elsewhere.
3789 */
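// e.g. a client holding Ax (CEPH_CAP_AUTH_EXCL) may have newer mode/uid/gid
// than we do, so rdlocking authlock here would add nothing; the client's
// own cached values make its stat() result valid.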
3790 client_t client = mdr->get_client();
3791 int issued = 0;
3792 Capability *cap = ref->get_client_cap(client);
3793 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3794 mdr->snapid <= cap->client_follows))
3795 issued = cap->issued();
3796
3797 // FIXME
3798 MutationImpl::LockOpVec lov;
3799 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3800 lov.add_rdlock(&ref->linklock);
3801 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3802 lov.add_rdlock(&ref->authlock);
3803 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3804 lov.add_rdlock(&ref->xattrlock);
3805 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3806 // Don't wait on unstable filelock if client is allowed to read file size.
3807 // This can reduce the response time of getattr in the case that multiple
3808 // clients do stat(2) and there are writers.
3809 // The downside of this optimization is that mds may not issue Fs caps along
3810 // with getattr reply. Client may need to send more getattr requests.
3811 if (mdr->is_rdlocked(&ref->filelock)) {
3812 lov.add_rdlock(&ref->filelock);
3813 } else if (ref->filelock.is_stable() ||
3814 ref->filelock.get_num_wrlocks() > 0 ||
3815 !ref->filelock.can_read(mdr->get_client())) {
3816 lov.add_rdlock(&ref->filelock);
3817 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3818 }
3819 }
3820
3821 if (!mds->locker->acquire_locks(mdr, lov))
3822 return;
3823
3824 if (!check_access(mdr, ref, MAY_READ))
3825 return;
3826
3827 utime_t now = ceph_clock_now();
3828 mdr->set_mds_stamp(now);
3829
3830 // note which caps are requested, so we return at least a snapshot
3831 // value for them. (currently this matters for xattrs and inline data)
3832 mdr->getattr_caps = mask;
3833
3834 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3835
3836 // reply
3837 dout(10) << "reply to stat on " << *req << dendl;
3838 mdr->tracei = ref;
3839 if (is_lookup)
3840 mdr->tracedn = mdr->dn[0].back();
3841 respond_to_request(mdr, 0);
3842 }
3843
3844 struct C_MDS_LookupIno2 : public ServerContext {
3845 MDRequestRef mdr;
3846 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3847 void finish(int r) override {
3848 server->_lookup_ino_2(mdr, r);
3849 }
3850 };
3851
3852 /*
3853 * filepath: ino
3854 */
3855 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3856 bool want_parent, bool want_dentry)
3857 {
3858 const cref_t<MClientRequest> &req = mdr->client_request;
3859
3860 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3861 return _lookup_snap_ino(mdr);
3862
3863 inodeno_t ino = req->get_filepath().get_ino();
3864 CInode *in = mdcache->get_inode(ino);
3865 if (in && in->state_test(CInode::STATE_PURGING)) {
3866 respond_to_request(mdr, -ESTALE);
3867 return;
3868 }
3869 if (!in) {
3870 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3871 return;
3872 }
3873
3874 if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
3875 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3876 return;
3877 }
3878
3879 // check for nothing (not read or write); this still applies the
3880 // path check.
3881 if (!check_access(mdr, in, 0))
3882 return;
3883
3884 CDentry *dn = in->get_projected_parent_dn();
3885 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3886
3887 MutationImpl::LockOpVec lov;
3888 if (dn && (want_parent || want_dentry)) {
3889 mdr->pin(dn);
3890 lov.add_rdlock(&dn->lock);
3891 }
3892
3893 unsigned mask = req->head.args.lookupino.mask;
3894 if (mask) {
3895 Capability *cap = in->get_client_cap(mdr->get_client());
3896 int issued = 0;
3897 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3898 issued = cap->issued();
3899 // FIXME
3900 // permission bits, ACL/security xattrs
3901 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3902 lov.add_rdlock(&in->authlock);
3903 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3904 lov.add_rdlock(&in->xattrlock);
3905
3906 mdr->getattr_caps = mask;
3907 }
3908
3909 if (!lov.empty()) {
3910 if (!mds->locker->acquire_locks(mdr, lov))
3911 return;
3912
3913 if (diri != NULL) {
3914 // need read access to directory inode
3915 if (!check_access(mdr, diri, MAY_READ))
3916 return;
3917 }
3918 }
3919
3920 if (want_parent) {
3921 if (in->is_base()) {
3922 respond_to_request(mdr, -EINVAL);
3923 return;
3924 }
3925 if (!diri || diri->is_stray()) {
3926 respond_to_request(mdr, -ESTALE);
3927 return;
3928 }
3929 dout(10) << "reply to lookup_parent " << *in << dendl;
3930 mdr->tracei = diri;
3931 respond_to_request(mdr, 0);
3932 } else {
3933 if (want_dentry) {
3934 inodeno_t dirino = req->get_filepath2().get_ino();
3935 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3936 respond_to_request(mdr, -ENOENT);
3937 return;
3938 }
3939 dout(10) << "reply to lookup_name " << *in << dendl;
3940 } else
3941 dout(10) << "reply to lookup_ino " << *in << dendl;
3942
3943 mdr->tracei = in;
3944 if (want_dentry)
3945 mdr->tracedn = dn;
3946 respond_to_request(mdr, 0);
3947 }
3948 }
3949
3950 void Server::_lookup_snap_ino(MDRequestRef& mdr)
3951 {
3952 const cref_t<MClientRequest> &req = mdr->client_request;
3953
3954 vinodeno_t vino;
3955 vino.ino = req->get_filepath().get_ino();
3956 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3957 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3958 __u32 hash = req->head.args.lookupino.hash;
3959
3960 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3961
3962 CInode *in = mdcache->lookup_snap_inode(vino);
3963 if (!in) {
3964 in = mdcache->get_inode(vino.ino);
3965 if (in) {
3966 if (in->state_test(CInode::STATE_PURGING) ||
3967 !in->has_snap_data(vino.snapid)) {
3968 if (in->is_dir() || !parent_ino) {
3969 respond_to_request(mdr, -ESTALE);
3970 return;
3971 }
3972 in = NULL;
3973 }
3974 }
3975 }
3976
3977 if (in) {
3978 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
3979 mdr->snapid = vino.snapid;
3980 mdr->tracei = in;
3981 respond_to_request(mdr, 0);
3982 return;
3983 }
3984
3985 CInode *diri = NULL;
3986 if (parent_ino) {
3987 diri = mdcache->get_inode(parent_ino);
3988 if (!diri) {
3989 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
3990 return;
3991 }
3992
3993 if (!diri->is_dir()) {
3994 respond_to_request(mdr, -EINVAL);
3995 return;
3996 }
3997
3998 MutationImpl::LockOpVec lov;
3999 lov.add_rdlock(&diri->dirfragtreelock);
4000 if (!mds->locker->acquire_locks(mdr, lov))
4001 return;
4002
4003 frag_t frag = diri->dirfragtree[hash];
4004 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4005 if (!dir)
4006 return;
4007
4008 if (!dir->is_complete()) {
4009 if (dir->is_frozen()) {
4010 mds->locker->drop_locks(mdr.get());
4011 mdr->drop_local_auth_pins();
4012 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4013 return;
4014 }
4015 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4016 return;
4017 }
4018
4019 respond_to_request(mdr, -ESTALE);
4020 } else {
4021 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4022 }
4023 }
4024
4025 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4026 {
4027 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4028 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4029
4030 // `r` is a rank if >=0, else an error code
4031 if (r >= 0) {
4032 mds_rank_t dest_rank(r);
4033 if (dest_rank == mds->get_nodeid())
4034 dispatch_client_request(mdr);
4035 else
4036 mdcache->request_forward(mdr, dest_rank);
4037 return;
4038 }
4039
4040 // give up
4041 if (r == -ENOENT || r == -ENODATA)
4042 r = -ESTALE;
4043 respond_to_request(mdr, r);
4044 }
4045
4046
4047 /* This function takes responsibility for the passed mdr */
4048 void Server::handle_client_open(MDRequestRef& mdr)
4049 {
4050 const cref_t<MClientRequest> &req = mdr->client_request;
4051 dout(7) << "open on " << req->get_filepath() << dendl;
4052
4053 int flags = req->head.args.open.flags;
4054 int cmode = ceph_flags_to_mode(flags);
4055 if (cmode < 0) {
4056 respond_to_request(mdr, -EINVAL);
4057 return;
4058 }
4059
4060 bool need_auth = !file_mode_is_readonly(cmode) ||
4061 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4062
4063 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4064 dout(7) << "read-only FS" << dendl;
4065 respond_to_request(mdr, -EROFS);
4066 return;
4067 }
4068
4069 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4070 if (!cur)
4071 return;
4072
4073 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4074 ceph_assert(!need_auth);
4075 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4076 CInode *cur = rdlock_path_pin_ref(mdr, true);
4077 if (!cur)
4078 return;
4079 }
4080
4081 if (!cur->inode.is_file()) {
4082 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4083 cmode = CEPH_FILE_MODE_PIN;
4084 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
4085 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4086 flags &= ~CEPH_O_TRUNC;
4087 }
4088
4089 dout(10) << "open flags = " << flags
4090 << ", filemode = " << cmode
4091 << ", need_auth = " << need_auth
4092 << dendl;
4093
4094 // regular file?
4095 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4096 dout(7) << "not a file or dir " << *cur << dendl;
4097 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
4098 return;
4099 }*/
4100 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
4101 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4102 respond_to_request(mdr, -EINVAL);
4103 return;
4104 }
4105
4106 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
4107 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4108 // return -EISDIR for a directory, -EINVAL for any other non-regular inode
4109 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
4110 return;
4111 }
4112
4113 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
4114 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4115 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4116 respond_to_request(mdr, -EPERM);
4117 return;
4118 }
4119
4120 // snapped data is read only
4121 if (mdr->snapid != CEPH_NOSNAP &&
4122 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4123 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4124 respond_to_request(mdr, -EROFS);
4125 return;
4126 }
4127
4128 MutationImpl::LockOpVec lov;
4129
4130 unsigned mask = req->head.args.open.mask;
4131 if (mask) {
4132 Capability *cap = cur->get_client_cap(mdr->get_client());
4133 int issued = 0;
4134 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4135 issued = cap->issued();
4136 // permission bits, ACL/security xattrs
4137 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4138 lov.add_rdlock(&cur->authlock);
4139 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4140 lov.add_rdlock(&cur->xattrlock);
4141
4142 mdr->getattr_caps = mask;
4143 }
4144
4145 // O_TRUNC
4146 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4147 ceph_assert(cur->is_auth());
4148
4149 lov.add_xlock(&cur->filelock);
4150 if (!mds->locker->acquire_locks(mdr, lov))
4151 return;
4152
4153 if (!check_access(mdr, cur, MAY_WRITE))
4154 return;
4155
4156 // wait for pending truncate?
4157 const auto pi = cur->get_projected_inode();
4158 if (pi->is_truncating()) {
4159 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4160 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4161 mds->locker->drop_locks(mdr.get());
4162 mdr->drop_local_auth_pins();
4163 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4164 return;
4165 }
4166
4167 do_open_truncate(mdr, cmode);
4168 return;
4169 }
4170
4171 // sync filelock if snapped.
4172 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4173 // and that data itself is flushed so that we can read the snapped data off disk.
4174 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4175 lov.add_rdlock(&cur->filelock);
4176 }
4177
4178 if (!mds->locker->acquire_locks(mdr, lov))
4179 return;
4180
4181 mask = MAY_READ;
4182 if (cmode & CEPH_FILE_MODE_WR)
4183 mask |= MAY_WRITE;
4184 if (!check_access(mdr, cur, mask))
4185 return;
4186
4187 utime_t now = ceph_clock_now();
4188 mdr->set_mds_stamp(now);
4189
4190 if (cur->is_file() || cur->is_dir()) {
4191 if (mdr->snapid == CEPH_NOSNAP) {
4192 // register new cap
4193 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4194 if (cap)
4195 dout(12) << "open issued caps " << ccap_string(cap->pending())
4196 << " for " << req->get_source()
4197 << " on " << *cur << dendl;
4198 } else {
4199 int caps = ceph_caps_for_mode(cmode);
4200 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4201 << " for " << req->get_source()
4202 << " snapid " << mdr->snapid
4203 << " on " << *cur << dendl;
4204 mdr->snap_caps = caps;
4205 }
4206 }
4207
4208 // increase max_size?
4209 if (cmode & CEPH_FILE_MODE_WR)
4210 mds->locker->check_inode_max_size(cur);
4211
4212 // make sure this inode gets into the journal
4213 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4214 mdcache->open_file_table.should_log_open(cur)) {
4215 EOpen *le = new EOpen(mds->mdlog);
4216 mdlog->start_entry(le);
4217 le->add_clean_inode(cur);
4218 mdlog->submit_entry(le);
4219 }
4220
4221 // hit pop
4222 if (cmode & CEPH_FILE_MODE_WR)
4223 mds->balancer->hit_inode(cur, META_POP_IWR);
4224 else
4225 mds->balancer->hit_inode(cur, META_POP_IRD,
4226 mdr->client_request->get_source().num());
4227
4228 CDentry *dn = 0;
4229 if (req->get_dentry_wanted()) {
4230 ceph_assert(mdr->dn[0].size());
4231 dn = mdr->dn[0].back();
4232 }
4233
4234 mdr->tracei = cur;
4235 mdr->tracedn = dn;
4236 respond_to_request(mdr, 0);
4237 }
4238
4239 class C_MDS_openc_finish : public ServerLogContext {
4240 CDentry *dn;
4241 CInode *newi;
4242 public:
4243 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4244 ServerLogContext(s, r), dn(d), newi(ni) {}
4245 void finish(int r) override {
4246 ceph_assert(r == 0);
4247
4248 dn->pop_projected_linkage();
4249
4250 // dirty inode, dn, dir
4251 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
4252 newi->mark_dirty(newi->inode.version+1, mdr->ls);
4253 newi->mark_dirty_parent(mdr->ls, true);
4254
4255 mdr->apply();
4256
4257 get_mds()->locker->share_inode_max_size(newi);
4258
4259 MDRequestRef null_ref;
4260 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4261
4262 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4263
4264 server->respond_to_request(mdr, 0);
4265
4266 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4267 }
4268 };
4269
4270 /* This function takes responsibility for the passed mdr */
4271 void Server::handle_client_openc(MDRequestRef& mdr)
4272 {
4273 const cref_t<MClientRequest> &req = mdr->client_request;
4274 client_t client = mdr->get_client();
4275
4276 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4277
4278 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4279 if (cmode < 0) {
4280 respond_to_request(mdr, -EINVAL);
4281 return;
4282 }
4283
4284 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4285 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4286 if (!dn)
4287 return;
4288
4289 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4290 if (!excl && !dnl->is_null()) {
4291 // it existed.
4292 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4293
4294 MutationImpl::LockOpVec lov;
4295 lov.add_rdlock(&dnl->get_inode()->snaplock);
4296 if (!mds->locker->acquire_locks(mdr, lov))
4297 return;
4298
4299 handle_client_open(mdr);
4300 return;
4301 }
4302
4303 ceph_assert(dnl->is_null());
4304
4305 // set layout
4306 file_layout_t layout;
4307 if (mdr->dir_layout != file_layout_t())
4308 layout = mdr->dir_layout;
4309 else
4310 layout = mdcache->default_file_layout;
4311
4312 // What kind of client caps are required to complete this operation
4313 uint64_t access = MAY_WRITE;
4314
4315 const auto default_layout = layout;
4316
4317 // fill in any special params from client
4318 if (req->head.args.open.stripe_unit)
4319 layout.stripe_unit = req->head.args.open.stripe_unit;
4320 if (req->head.args.open.stripe_count)
4321 layout.stripe_count = req->head.args.open.stripe_count;
4322 if (req->head.args.open.object_size)
4323 layout.object_size = req->head.args.open.object_size;
4324 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4325 (__s32)req->head.args.open.pool >= 0) {
4326 layout.pool_id = req->head.args.open.pool;
4327
4328 // make sure we have as new a map as the client
4329 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4330 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4331 return;
4332 }
4333 }
4334
4335 // If client doesn't have capability to modify layout pools, then
4336 // only permit this request if the requested pool matches what the
4337 // file would have inherited anyway from its parent.
4338 if (default_layout != layout) {
4339 access |= MAY_SET_VXATTR;
4340 }
4341
4342 if (!layout.is_valid()) {
4343 dout(10) << " invalid initial file layout" << dendl;
4344 respond_to_request(mdr, -EINVAL);
4345 return;
4346 }
4347 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4348 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4349 respond_to_request(mdr, -EINVAL);
4350 return;
4351 }
4352
4353 // created null dn.
4354 CDir *dir = dn->get_dir();
4355 CInode *diri = dir->get_inode();
4356 if (!check_access(mdr, diri, access))
4357 return;
4358 if (!check_fragment_space(mdr, dir))
4359 return;
4360
4361 if (mdr->dn[0].size() == 1)
4362 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4363
4364 // create inode.
4365 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4366 req->head.args.open.mode | S_IFREG, &layout);
4367 ceph_assert(in);
4368
4369 // it's a file.
4370 dn->push_projected_linkage(in);
4371
4372 in->inode.version = dn->pre_dirty();
4373 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4374 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
4375 in->inode.update_backtrace();
4376 in->inode.rstat.rfiles = 1;
4377
4378 SnapRealm *realm = diri->find_snaprealm();
4379 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4380 ceph_assert(follows >= realm->get_newest_seq());
4381
4382 ceph_assert(dn->first == follows+1);
4383 in->first = dn->first;
4384
4385 // do the open
4386 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
4387 in->authlock.set_state(LOCK_EXCL);
4388 in->xattrlock.set_state(LOCK_EXCL);
4389
4390 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4391 in->inode.client_ranges[client].range.first = 0;
4392 in->inode.client_ranges[client].range.last = in->inode.layout.stripe_unit;
4393 in->inode.client_ranges[client].follows = follows;
4394 cap->mark_clientwriteable();
4395 }
4396
4397 // prepare finisher
4398 mdr->ls = mdlog->get_current_segment();
4399 EUpdate *le = new EUpdate(mdlog, "openc");
4400 mdlog->start_entry(le);
4401 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4402 journal_allocated_inos(mdr, &le->metablob);
4403 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4404 le->metablob.add_primary_dentry(dn, in, true, true, true);
4405
4406 // make sure this inode gets into the journal
4407 le->metablob.add_opened_ino(in->ino());
4408
4409 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
4410
4411 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4412 openc_response_t ocresp;
4413
4414 dout(10) << "adding created_ino and delegated_inos" << dendl;
4415 ocresp.created_ino = in->inode.ino;
4416
4417 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4418 // Try to delegate some prealloc_inos to the client, if it's down to half the max
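// e.g. (assuming delegate_inos_pct=50 and mds_client_prealloc_inos=1000):
// frac=2, so once the client's delegated set drops below 1000/2/2 = 250
// inos we hand it another 1000/2 = 500.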
4419 unsigned frac = 100 / delegate_inos_pct;
4420 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4421 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4422 }
4423
4424 encode(ocresp, mdr->reply_extra_bl);
4425 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4426 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4427 // add the file created flag onto the reply if create_flags features is supported
4428 encode(in->inode.ino, mdr->reply_extra_bl);
4429 }
4430
4431 journal_and_reply(mdr, in, dn, le, fin);
4432
4433 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4434 // have overshot the split size (multiple opencs in flight), so here is
4435 // an early chance to split the dir if this openc makes it oversized.
4436 mds->balancer->maybe_fragment(dir, false);
4437 }
4438
4439
4440
4441 void Server::handle_client_readdir(MDRequestRef& mdr)
4442 {
4443 const cref_t<MClientRequest> &req = mdr->client_request;
4444 client_t client = req->get_source().num();
4445 MutationImpl::LockOpVec lov;
4446 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4447 if (!diri) return;
4448
4449 // it's a directory, right?
4450 if (!diri->is_dir()) {
4451 // not a dir
4452 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
4453 respond_to_request(mdr, -ENOTDIR);
4454 return;
4455 }
4456
4457 lov.add_rdlock(&diri->filelock);
4458 lov.add_rdlock(&diri->dirfragtreelock);
4459
4460 if (!mds->locker->acquire_locks(mdr, lov))
4461 return;
4462
4463 if (!check_access(mdr, diri, MAY_READ))
4464 return;
4465
4466 // which frag?
4467 frag_t fg = (__u32)req->head.args.readdir.frag;
4468 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4469 string offset_str = req->get_path2();
4470
4471 __u32 offset_hash = 0;
4472 if (!offset_str.empty())
4473 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4474 else
4475 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4476
4477 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4478 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4479
4480 // does the frag exist?
4481 if (diri->dirfragtree[fg.value()] != fg) {
4482 frag_t newfg;
4483 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4484 if (fg.contains((unsigned)offset_hash)) {
4485 newfg = diri->dirfragtree[offset_hash];
4486 } else {
4487 // client actually wants next frag
4488 newfg = diri->dirfragtree[fg.value()];
4489 }
4490 } else {
4491 offset_str.clear();
4492 newfg = diri->dirfragtree[fg.value()];
4493 }
4494 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4495 fg = newfg;
4496 }
4497
4498 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4499 if (!dir) return;
4500
4501 // ok!
4502 dout(10) << "handle_client_readdir on " << *dir << dendl;
4503 ceph_assert(dir->is_auth());
4504
4505 if (!dir->is_complete()) {
4506 if (dir->is_frozen()) {
4507 dout(7) << "dir is frozen " << *dir << dendl;
4508 mds->locker->drop_locks(mdr.get());
4509 mdr->drop_local_auth_pins();
4510 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4511 return;
4512 }
4513 // fetch
4514 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4515 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4516 return;
4517 }
4518
4519 #ifdef MDS_VERIFY_FRAGSTAT
4520 dir->verify_fragstat();
4521 #endif
4522
4523 utime_t now = ceph_clock_now();
4524 mdr->set_mds_stamp(now);
4525
4526 snapid_t snapid = mdr->snapid;
4527 dout(10) << "snapid " << snapid << dendl;
4528
4529 SnapRealm *realm = diri->find_snaprealm();
4530
4531 unsigned max = req->head.args.readdir.max_entries;
4532 if (!max)
4533 max = dir->get_num_any(); // whatever, something big.
4534 unsigned max_bytes = req->head.args.readdir.max_bytes;
4535 if (!max_bytes)
4536 // make sure at least one item can be encoded
4537 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4538
4539 // start final blob
4540 bufferlist dirbl;
4541 DirStat ds;
4542 ds.frag = dir->get_frag();
4543 ds.auth = dir->get_dir_auth().first;
4544 if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
4545 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4546
4547 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4548
4549 // count bytes available.
4550 // this isn't perfect, but we should capture the main variable/unbounded size items!
4551 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4552 int bytes_left = max_bytes - front_bytes;
4553 bytes_left -= realm->get_snap_trace().length();
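// bytes_left now bounds the per-entry payload accumulated in dnbl below:
// each entry costs its encoded name, a lease, and an inodestat, and the
// loop stops early (or trims a partially encoded entry) rather than
// exceed it.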
4554
4555 // build dir contents
4556 bufferlist dnbl;
4557 __u32 numfiles = 0;
4558 bool start = !offset_hash && offset_str.empty();
4559 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4560 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4561 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4562 bool end = (it == dir->end());
4563 for (; !end && numfiles < max; end = (it == dir->end())) {
4564 CDentry *dn = it->second;
4565 ++it;
4566
4567 if (dn->state_test(CDentry::STATE_PURGING))
4568 continue;
4569
4570 bool dnp = dn->use_projected(client, mdr);
4571 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4572
4573 if (dnl->is_null())
4574 continue;
4575
4576 if (dn->last < snapid || dn->first > snapid) {
4577 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4578 continue;
4579 }
4580
4581 if (!start) {
4582 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4583 if (!(offset_key < dn->key()))
4584 continue;
4585 }
4586
4587 CInode *in = dnl->get_inode();
4588
4589 if (in && in->ino() == CEPH_INO_CEPH)
4590 continue;
4591
4592 // remote link?
4593 // better for the MDS to do the work, if we think the client will stat any of these files.
4594 if (dnl->is_remote() && !in) {
4595 in = mdcache->get_inode(dnl->get_remote_ino());
4596 if (in) {
4597 dn->link_remote(dnl, in);
4598 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4599 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4600 continue;
4601 } else {
4602 // touch everything i _do_ have
4603 for (auto &p : *dir) {
4604 if (!p.second->get_linkage()->is_null())
4605 mdcache->lru.lru_touch(p.second);
4606 }
4607
4608 // already issued caps and leases, reply immediately.
4609 if (dnbl.length() > 0) {
4610 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4611 dout(10) << " open remote dentry after caps were issued, stopping at "
4612 << dnbl.length() << " < " << bytes_left << dendl;
4613 break;
4614 }
4615
4616 mds->locker->drop_locks(mdr.get());
4617 mdr->drop_local_auth_pins();
4618 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4619 return;
4620 }
4621 }
4622 ceph_assert(in);
4623
4624 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4625 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4626 break;
4627 }
4628
4629 unsigned start_len = dnbl.length();
4630
4631 // dentry
4632 dout(12) << "including dn " << *dn << dendl;
4633 encode(dn->get_name(), dnbl);
4634 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4635 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
4636
4637 // inode
4638 dout(12) << "including inode " << *in << dendl;
4639 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4640 if (r < 0) {
4641 // chop off dn->name, lease
4642 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4643 bufferlist keep;
4644 keep.substr_of(dnbl, 0, start_len);
4645 dnbl.swap(keep);
4646 break;
4647 }
4648 ceph_assert(r >= 0);
4649 numfiles++;
4650
4651 // touch dn
4652 mdcache->lru.lru_touch(dn);
4653 }
4654
4655 __u16 flags = 0;
4656 if (end) {
4657 flags = CEPH_READDIR_FRAG_END;
4658 if (start)
4659 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4660 }
4661 // clients without CEPH_READDIR_REPLY_BITFLAGS only understand the END and COMPLETE flags
4662 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4663 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4664 }
4665
4666 // finish final blob
4667 encode(numfiles, dirbl);
4668 encode(flags, dirbl);
4669 dirbl.claim_append(dnbl);
4670
4671 // yay, reply
4672 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4673 << " bytes=" << dirbl.length()
4674 << " start=" << (int)start
4675 << " end=" << (int)end
4676 << dendl;
4677 mdr->reply_extra_bl = dirbl;
4678
4679 // bump popularity. NOTE: this doesn't quite capture it.
4680 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
4681
4682 // reply
4683 mdr->tracei = diri;
4684 respond_to_request(mdr, 0);
4685 }
4686
4687
4688
4689 // ===============================================================================
4690 // INODE UPDATES
4691
4692
4693 /*
4694 * finisher for basic inode updates
4695 */
4696 class C_MDS_inode_update_finish : public ServerLogContext {
4697 CInode *in;
4698 bool truncating_smaller, changed_ranges, new_realm;
4699 public:
4700 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4701 bool sm=false, bool cr=false, bool nr=false) :
4702 ServerLogContext(s, r), in(i),
4703 truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
4704 void finish(int r) override {
4705 ceph_assert(r == 0);
4706
4707 // apply
4708 in->pop_and_dirty_projected_inode(mdr->ls);
4709 mdr->apply();
4710
4711 MDSRank *mds = get_mds();
4712
4713 // notify any clients
4714 if (truncating_smaller && in->inode.is_truncating()) {
4715 mds->locker->issue_truncate(in);
4716 mds->mdcache->truncate_inode(in, mdr->ls);
4717 }
4718
4719 if (new_realm) {
4720 int op = CEPH_SNAP_OP_SPLIT;
4721 mds->mdcache->send_snap_update(in, 0, op);
4722 mds->mdcache->do_realm_invalidate_and_update_notify(in, op);
4723 }
4724
4725 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4726
4727 server->respond_to_request(mdr, 0);
4728
4729 if (changed_ranges)
4730 get_mds()->locker->share_inode_max_size(in);
4731 }
4732 };
4733
4734 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4735 {
4736 const cref_t<MClientRequest> &req = mdr->client_request;
4737 MutationImpl::LockOpVec lov;
4738
4739 // get the inode to operate on, and set up any locks needed for that
4740 CInode *cur = rdlock_path_pin_ref(mdr, true);
4741 if (!cur)
4742 return;
4743
4744 lov.add_xlock(&cur->flocklock);
4745 /* acquire_locks will return true if it gets the locks. If it fails,
4746 it will requeue this request and retry it later, so just return.
4747 */
4748 if (!mds->locker->acquire_locks(mdr, lov)) {
4749 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4750 return;
4751 }
4752
4753 // copy the lock change into a ceph_filelock so we can store/apply it
4754 ceph_filelock set_lock;
4755 set_lock.start = req->head.args.filelock_change.start;
4756 set_lock.length = req->head.args.filelock_change.length;
4757 set_lock.client = req->get_orig_source().num();
4758 set_lock.owner = req->head.args.filelock_change.owner;
4759 set_lock.pid = req->head.args.filelock_change.pid;
4760 set_lock.type = req->head.args.filelock_change.type;
4761 bool will_wait = req->head.args.filelock_change.wait;
4762
4763 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4764
4765 ceph_lock_state_t *lock_state = NULL;
4766 bool interrupt = false;
4767
4768 // get the appropriate lock state
4769 switch (req->head.args.filelock_change.rule) {
4770 case CEPH_LOCK_FLOCK_INTR:
4771 interrupt = true;
4772 // fall-thru
4773 case CEPH_LOCK_FLOCK:
4774 lock_state = cur->get_flock_lock_state();
4775 break;
4776
4777 case CEPH_LOCK_FCNTL_INTR:
4778 interrupt = true;
4779 // fall-thru
4780 case CEPH_LOCK_FCNTL:
4781 lock_state = cur->get_fcntl_lock_state();
4782 break;
4783
4784 default:
4785 dout(10) << "got unknown lock type " << set_lock.type
4786 << ", dropping request!" << dendl;
4787 respond_to_request(mdr, -EOPNOTSUPP);
4788 return;
4789 }
4790
4791 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4792 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4793 list<ceph_filelock> activated_locks;
4794 MDSContext::vec waiters;
4795 if (lock_state->is_waiting(set_lock)) {
4796 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4797 lock_state->remove_waiting(set_lock);
4798 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4799 } else if (!interrupt) {
4800 dout(10) << " unlock attempt on " << set_lock << dendl;
4801 lock_state->remove_lock(set_lock, activated_locks);
4802 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4803 }
4804 mds->queue_waiters(waiters);
4805
4806 respond_to_request(mdr, 0);
4807 } else {
4808 dout(10) << " lock attempt on " << set_lock << dendl;
4809 bool deadlock = false;
4810 if (mdr->more()->flock_was_waiting &&
4811 !lock_state->is_waiting(set_lock)) {
4812 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4813 respond_to_request(mdr, -EINTR);
4814 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4815 dout(10) << " it failed on this attempt" << dendl;
4816 // couldn't set lock right now
4817 if (deadlock) {
4818 respond_to_request(mdr, -EDEADLK);
4819 } else if (!will_wait) {
4820 respond_to_request(mdr, -EWOULDBLOCK);
4821 } else {
4822 dout(10) << " added to waiting list" << dendl;
4823 ceph_assert(lock_state->is_waiting(set_lock));
4824 mdr->more()->flock_was_waiting = true;
4825 mds->locker->drop_locks(mdr.get());
4826 mdr->drop_local_auth_pins();
4827 mdr->mark_event("failed to add lock, waiting");
4828 mdr->mark_nowarn();
4829 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4830 }
4831 } else
4832 respond_to_request(mdr, 0);
4833 }
4834 dout(10) << " state after lock change: " << *lock_state << dendl;
4835 }
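// Setlock outcomes above, in sketch form:
//   unlock            -> drop the lock/waiter, wake WAIT_FLOCK waiters, reply 0
//   granted           -> reply 0
//   conflict          -> -EWOULDBLOCK if not waiting, -EDEADLK on a detected
//                        cycle, else park on WAIT_FLOCK and retry after unlock
//   canceled waiter   -> -EINTR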
4836
4837 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4838 {
4839 const cref_t<MClientRequest> &req = mdr->client_request;
4840 MutationImpl::LockOpVec lov;
4841
4842 // get the inode to operate on, and set up any locks needed for that
4843 CInode *cur = rdlock_path_pin_ref(mdr, true);
4844 if (!cur)
4845 return;
4846
4847 /* acquire_locks will return true if it gets the locks. If it fails,
4848 it will requeue this request and retry it later, so just return.
4849 */
4850 lov.add_rdlock(&cur->flocklock);
4851 if (!mds->locker->acquire_locks(mdr, lov)) {
4852 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4853 return;
4854 }
4855
4856 // copy the lock change into a ceph_filelock so we can store/apply it
4857 ceph_filelock checking_lock;
4858 checking_lock.start = req->head.args.filelock_change.start;
4859 checking_lock.length = req->head.args.filelock_change.length;
4860 checking_lock.client = req->get_orig_source().num();
4861 checking_lock.owner = req->head.args.filelock_change.owner;
4862 checking_lock.pid = req->head.args.filelock_change.pid;
4863 checking_lock.type = req->head.args.filelock_change.type;
4864
4865 // get the appropriate lock state
4866 ceph_lock_state_t *lock_state = NULL;
4867 switch (req->head.args.filelock_change.rule) {
4868 case CEPH_LOCK_FLOCK:
4869 lock_state = cur->get_flock_lock_state();
4870 break;
4871
4872 case CEPH_LOCK_FCNTL:
4873 lock_state = cur->get_fcntl_lock_state();
4874 break;
4875
4876 default:
4877 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4878 respond_to_request(mdr, -EINVAL);
4879 return;
4880 }
4881 lock_state->look_for_lock(checking_lock);
4882
4883 bufferlist lock_bl;
4884 encode(checking_lock, lock_bl);
4885
4886 mdr->reply_extra_bl = lock_bl;
4887 respond_to_request(mdr, 0);
4888 }
4889
4890 void Server::handle_client_setattr(MDRequestRef& mdr)
4891 {
4892 const cref_t<MClientRequest> &req = mdr->client_request;
4893 MutationImpl::LockOpVec lov;
4894 CInode *cur = rdlock_path_pin_ref(mdr, true);
4895 if (!cur) return;
4896
4897 if (mdr->snapid != CEPH_NOSNAP) {
4898 respond_to_request(mdr, -EROFS);
4899 return;
4900 }
4901 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4902 respond_to_request(mdr, -EPERM);
4903 return;
4904 }
4905
4906 __u32 mask = req->head.args.setattr.mask;
4907 __u32 access_mask = MAY_WRITE;
4908
4909 // xlock inode
4910 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4911 lov.add_xlock(&cur->authlock);
4912 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4913 lov.add_xlock(&cur->filelock);
4914 if (mask & CEPH_SETATTR_CTIME)
4915 lov.add_wrlock(&cur->versionlock);
4916
4917 if (!mds->locker->acquire_locks(mdr, lov))
4918 return;
4919
4920 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4921 access_mask |= MAY_CHOWN;
4922
4923 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4924 access_mask |= MAY_CHGRP;
4925
4926 if (!check_access(mdr, cur, access_mask))
4927 return;
4928
4929 // trunc from bigger -> smaller?
4930 auto pip = cur->get_projected_inode();
4931
4932 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
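// the client sends its own idea of the file size in old_size; taking the
// max presumably covers writes it has performed under its caps that the
// MDS has not yet been told about.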
4933
4934 // ENOSPC on growing file while full, but allow shrinks
4935 if (is_full && req->head.args.setattr.size > old_size) {
4936 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4937 respond_to_request(mdr, -ENOSPC);
4938 return;
4939 }
4940
4941 bool truncating_smaller = false;
4942 if (mask & CEPH_SETATTR_SIZE) {
4943 truncating_smaller = req->head.args.setattr.size < old_size;
4944 if (truncating_smaller && pip->is_truncating()) {
4945 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4946 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4947 mds->locker->drop_locks(mdr.get());
4948 mdr->drop_local_auth_pins();
4949 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4950 return;
4951 }
4952 }
4953
4954 bool changed_ranges = false;
4955
4956 // project update
4957 mdr->ls = mdlog->get_current_segment();
4958 EUpdate *le = new EUpdate(mdlog, "setattr");
4959 mdlog->start_entry(le);
4960
4961 auto &pi = cur->project_inode();
4962
4963 if (mask & CEPH_SETATTR_UID)
4964 pi.inode.uid = req->head.args.setattr.uid;
4965 if (mask & CEPH_SETATTR_GID)
4966 pi.inode.gid = req->head.args.setattr.gid;
4967
4968 if (mask & CEPH_SETATTR_MODE)
4969 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4970 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4971 S_ISREG(pi.inode.mode) &&
4972 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4973 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4974 }
4975
4976 if (mask & CEPH_SETATTR_MTIME)
4977 pi.inode.mtime = req->head.args.setattr.mtime;
4978 if (mask & CEPH_SETATTR_ATIME)
4979 pi.inode.atime = req->head.args.setattr.atime;
4980 if (mask & CEPH_SETATTR_BTIME)
4981 pi.inode.btime = req->head.args.setattr.btime;
4982 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4983 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4984 if (mask & CEPH_SETATTR_SIZE) {
4985 if (truncating_smaller) {
4986 pi.inode.truncate(old_size, req->head.args.setattr.size);
4987 le->metablob.add_truncate_start(cur->ino());
4988 } else {
4989 pi.inode.size = req->head.args.setattr.size;
4990 pi.inode.rstat.rbytes = pi.inode.size;
4991 }
4992 pi.inode.mtime = mdr->get_op_stamp();
4993
4994 // adjust client's max_size?
4995 CInode::mempool_inode::client_range_map new_ranges;
4996 bool max_increased = false;
4997 mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
4998 if (pi.inode.client_ranges != new_ranges) {
4999 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
5000 pi.inode.client_ranges = new_ranges;
5001 changed_ranges = true;
5002 }
5003 }
5004
5005 pi.inode.version = cur->pre_dirty();
5006 pi.inode.ctime = mdr->get_op_stamp();
5007 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5008 pi.inode.rstat.rctime = mdr->get_op_stamp();
5009 pi.inode.change_attr++;
5010
5011 // log + wait
5012 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5013 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5014 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5015
5016 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5017 truncating_smaller, changed_ranges));
5018
5019 // flush immediately if there are readers/writers waiting
5020 if (mdr->is_xlocked(&cur->filelock) &&
5021 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5022 mds->mdlog->flush();
5023 }
5024
5025 /* Takes responsibility for mdr */
5026 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5027 {
5028 CInode *in = mdr->in[0];
5029 client_t client = mdr->get_client();
5030 ceph_assert(in);
5031
5032 dout(10) << "do_open_truncate " << *in << dendl;
5033
5034 SnapRealm *realm = in->find_snaprealm();
5035 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5036
5037 mdr->ls = mdlog->get_current_segment();
5038 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5039 mdlog->start_entry(le);
5040
5041 // prepare
5042 auto &pi = in->project_inode();
5043 pi.inode.version = in->pre_dirty();
5044 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
5045 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5046 pi.inode.rstat.rctime = mdr->get_op_stamp();
5047 pi.inode.change_attr++;
5048
5049 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
5050 if (old_size > 0) {
5051 pi.inode.truncate(old_size, 0);
5052 le->metablob.add_truncate_start(in->ino());
5053 }
5054
5055 bool changed_ranges = false;
5056 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5057 pi.inode.client_ranges[client].range.first = 0;
5058 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
5059 pi.inode.client_ranges[client].follows = realm->get_newest_seq();
5060 changed_ranges = true;
5061 cap->mark_clientwriteable();
5062 }
5063
5064 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5065
5066 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5067 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5068
5069 // make sure ino gets into the journal
5070 le->metablob.add_opened_ino(in->ino());
5071
5072 mdr->o_trunc = true;
5073
5074 CDentry *dn = 0;
5075 if (mdr->client_request->get_dentry_wanted()) {
5076 ceph_assert(mdr->dn[0].size());
5077 dn = mdr->dn[0].back();
5078 }
5079
5080 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5081 changed_ranges));
5082 // Although the `open` part can give an early reply, the truncation won't
5083 // happen until our EUpdate is persistent; to give the client a prompt
5084 // response we must also flush that event.
5085 mdlog->flush();
5086 }
5087
5088
5089 /* This function cleans up the passed mdr */
5090 void Server::handle_client_setlayout(MDRequestRef& mdr)
5091 {
5092 const cref_t<MClientRequest> &req = mdr->client_request;
5093 CInode *cur = rdlock_path_pin_ref(mdr, true);
5094 if (!cur) return;
5095
5096 if (mdr->snapid != CEPH_NOSNAP) {
5097 respond_to_request(mdr, -EROFS);
5098 return;
5099 }
5100 if (!cur->is_file()) {
5101 respond_to_request(mdr, -EINVAL);
5102 return;
5103 }
5104 if (cur->get_projected_inode()->size ||
5105 cur->get_projected_inode()->truncate_seq > 1) {
5106 respond_to_request(mdr, -ENOTEMPTY);
5107 return;
5108 }
5109
5110 // validate layout
5111 file_layout_t layout = cur->get_projected_inode()->layout;
5112 // save existing layout for later
5113 const auto old_layout = layout;
5114
5115 int access = MAY_WRITE;
5116
5117 if (req->head.args.setlayout.layout.fl_object_size > 0)
5118 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5119 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5120 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5121 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5122 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5123 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5124 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5125
5126 // make sure we have as new a map as the client
5127 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5128 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5129 return;
5130 }
5131 }
5132
5133 // Don't permit layout modifications without 'p' caps
5134 if (layout != old_layout) {
5135 access |= MAY_SET_VXATTR;
5136 }
5137
5138 if (!layout.is_valid()) {
5139 dout(10) << "bad layout" << dendl;
5140 respond_to_request(mdr, -EINVAL);
5141 return;
5142 }
5143 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5144 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5145 respond_to_request(mdr, -EINVAL);
5146 return;
5147 }
5148
5149 MutationImpl::LockOpVec lov;
5150 lov.add_xlock(&cur->filelock);
5151 if (!mds->locker->acquire_locks(mdr, lov))
5152 return;
5153
5154 if (!check_access(mdr, cur, access))
5155 return;
5156
5157 // project update
5158 auto &pi = cur->project_inode();
5159 pi.inode.layout = layout;
5160 // add the old pool to the inode
5161 pi.inode.add_old_pool(old_layout.pool_id);
5162 pi.inode.version = cur->pre_dirty();
5163 pi.inode.ctime = mdr->get_op_stamp();
5164 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5165 pi.inode.rstat.rctime = mdr->get_op_stamp();
5166 pi.inode.change_attr++;
5167
5168 // log + wait
5169 mdr->ls = mdlog->get_current_segment();
5170 EUpdate *le = new EUpdate(mdlog, "setlayout");
5171 mdlog->start_entry(le);
5172 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5173 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5174 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5175
5176 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5177 }
5178
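/* Helper for policy (layout/quota/pin) changes: xlocks the inode's
 * policylock, takes snaplock (xlock or rdlock per xlock_snaplock), and
 * rdlocks snap/layout state up the parent chain; idempotent via the
 * ALL_LOCKED flag. When want_layout is set, the effective layout (the
 * inode's own, else an ancestor's found by try_rdlock_snap_layout) is
 * captured in mdr->dir_layout.
 */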
5179 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5180 {
5181 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5182 return true;
5183
5184 MutationImpl::LockOpVec lov;
5185 lov.add_xlock(&in->policylock);
5186 if (xlock_snaplock)
5187 lov.add_xlock(&in->snaplock);
5188 else
5189 lov.add_rdlock(&in->snaplock);
5190 if (!mds->locker->acquire_locks(mdr, lov))
5191 return false;
5192
5193 if (want_layout && in->get_projected_inode()->has_layout()) {
5194 mdr->dir_layout = in->get_projected_inode()->layout;
5195 want_layout = false;
5196 }
5197 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5198 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5199 return false;
5200 }
5201
5202 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5203 return true;
5204 }
5205
5206 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5207 {
5208 CInode *in = mdcache->get_inode(ino);
5209 if (!in || in->state_test(CInode::STATE_PURGING)) {
5210 respond_to_request(mdr, -ESTALE);
5211 return nullptr;
5212 }
5213 if (!in->is_auth()) {
5214 mdcache->request_forward(mdr, in->authority().first);
5215 return nullptr;
5216 }
5217
5218 return in;
5219 }
5220
5221 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5222 {
5223 const cref_t<MClientRequest> &req = mdr->client_request;
5224
5225 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5226 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5227 if (!cur)
5228 return;
5229
5230 if (!cur->is_dir()) {
5231 respond_to_request(mdr, -ENOTDIR);
5232 return;
5233 }
5234
5235 if (!xlock_policylock(mdr, cur, true))
5236 return;
5237
5238 // validate layout
5239 const auto old_pi = cur->get_projected_inode();
5240 file_layout_t layout;
5241 if (old_pi->has_layout())
5242 layout = old_pi->layout;
5243 else if (mdr->dir_layout != file_layout_t())
5244 layout = mdr->dir_layout;
5245 else
5246 layout = mdcache->default_file_layout;
5247
5248 // Level of access required to complete
5249 int access = MAY_WRITE;
5250
5251 const auto old_layout = layout;
5252
5253 if (req->head.args.setlayout.layout.fl_object_size > 0)
5254 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5255 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5256 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5257 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5258 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5259 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5260 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5261 // make sure we have as new a map as the client
5262 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5263 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5264 return;
5265 }
5266 }
5267
5268 if (layout != old_layout) {
5269 access |= MAY_SET_VXATTR;
5270 }
5271
5272 if (!layout.is_valid()) {
5273 dout(10) << "bad layout" << dendl;
5274 respond_to_request(mdr, -EINVAL);
5275 return;
5276 }
5277 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5278 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5279 respond_to_request(mdr, -EINVAL);
5280 return;
5281 }
5282
5283 if (!check_access(mdr, cur, access))
5284 return;
5285
5286 auto &pi = cur->project_inode();
5287 pi.inode.layout = layout;
5288 pi.inode.version = cur->pre_dirty();
5289
5290 // log + wait
5291 mdr->ls = mdlog->get_current_segment();
5292 EUpdate *le = new EUpdate(mdlog, "setlayout");
5293 mdlog->start_entry(le);
5294 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5295 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5296 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5297
5298 mdr->no_early_reply = true;
5299 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5300 }
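// note mdr->no_early_reply above: like the quota and vxattr paths, a
// directory layout change is only acknowledged once the EUpdate commits,
// so a client never observes a policy change that a crash could undo.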
5301
5302 // XATTRS
5303
5304 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5305 file_layout_t *layout, bool validate)
5306 {
5307 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5308 try {
5309 if (name == "layout") {
5310 string::iterator begin = value.begin();
5311 string::iterator end = value.end();
5312 keys_and_values<string::iterator> p; // create instance of parser
5313 std::map<string, string> m; // map to receive results
5314 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5315 return -EINVAL;
5316 }
5317 string left(begin, end);
5318 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5319 if (begin != end)
5320 return -EINVAL;
5321 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5322 // Skip validation on each attr; we do it once at the end (avoids
5323 // rejecting intermediate states if the overall result is ok)
5324 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5325 osdmap, layout, false);
5326 if (r < 0)
5327 return r;
5328 }
5329 } else if (name == "layout.object_size") {
5330 layout->object_size = boost::lexical_cast<unsigned>(value);
5331 } else if (name == "layout.stripe_unit") {
5332 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5333 } else if (name == "layout.stripe_count") {
5334 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5335 } else if (name == "layout.pool") {
5336 try {
5337 layout->pool_id = boost::lexical_cast<unsigned>(value);
5338 } catch (boost::bad_lexical_cast const&) {
5339 int64_t pool = osdmap.lookup_pg_pool_name(value);
5340 if (pool < 0) {
5341 dout(10) << " unknown pool " << value << dendl;
5342 return -ENOENT;
5343 }
5344 layout->pool_id = pool;
5345 }
5346 } else if (name == "layout.pool_namespace") {
5347 layout->pool_ns = value;
5348 } else {
5349 dout(10) << " unknown layout vxattr " << name << dendl;
5350 return -EINVAL;
5351 }
5352 } catch (boost::bad_lexical_cast const&) {
5353 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5354 return -EINVAL;
5355 }
5356
5357 if (validate && !layout->is_valid()) {
5358 dout(10) << "bad layout" << dendl;
5359 return -EINVAL;
5360 }
5361 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5362 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5363 return -EINVAL;
5364 }
5365 return 0;
5366 }
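// Illustrative usage (matches the CephFS file-layout docs): the whole
// layout can arrive as one blob,
//   setfattr -n ceph.dir.layout \
//     -v "stripe_unit=4194304 stripe_count=8 object_size=4194304 pool=cephfs_data" mydir
// which keys_and_values<> splits and re-dispatches per key with
// validate=false, or as single keys such as ceph.dir.layout.pool; either
// way the assembled layout is validated once at the end.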
5367
5368 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5369 {
5370 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5371 try {
5372 if (name == "quota") {
5373 string::iterator begin = value.begin();
5374 string::iterator end = value.end();
5375 if (begin == end) {
5376 // keep quota unchanged. (for create_quota_realm())
5377 return 0;
5378 }
5379 keys_and_values<string::iterator> p; // create instance of parser
5380 std::map<string, string> m; // map to receive results
5381 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5382 return -EINVAL;
5383 }
5384 string left(begin, end);
5385 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5386 if (begin != end)
5387 return -EINVAL;
5388 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5389 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5390 if (r < 0)
5391 return r;
5392 }
5393 } else if (name == "quota.max_bytes") {
5394 int64_t q = boost::lexical_cast<int64_t>(value);
5395 if (q < 0)
5396 return -EINVAL;
5397 quota->max_bytes = q;
5398 } else if (name == "quota.max_files") {
5399 int64_t q = boost::lexical_cast<int64_t>(value);
5400 if (q < 0)
5401 return -EINVAL;
5402 quota->max_files = q;
5403 } else {
5404 dout(10) << " unknown quota vxattr " << name << dendl;
5405 return -EINVAL;
5406 }
5407 } catch (boost::bad_lexical_cast const&) {
5408 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5409 return -EINVAL;
5410 }
5411
5412 if (!quota->is_valid()) {
5413 dout(10) << "bad quota" << dendl;
5414 return -EINVAL;
5415 }
5416 return 0;
5417 }
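// Illustrative usage: quotas take the same key=value form, e.g.
//   setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir
//   setfattr -n ceph.quota.max_files -v 10000 /some/dir
// a negative value is rejected with EINVAL above, and 0 clears the limit.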
5418
5419 void Server::create_quota_realm(CInode *in)
5420 {
5421 dout(10) << __func__ << " " << *in << dendl;
5422
5423 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5424 req->set_filepath(filepath(in->ino()));
5425 req->set_string2("ceph.quota");
5426 // empty vxattr value
5427 req->set_tid(mds->issue_tid());
5428
5429 mds->send_message_mds(req, in->authority().first);
5430 }
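// the empty-value SETXATTR above is deliberate: parse_quota_vxattr()
// treats an empty "quota" payload as "keep quota unchanged", so this
// message exists purely to make the inode's auth MDS instantiate a
// quota snaprealm.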
5431
5432 /*
5433 * Verify that the file layout attribute carried by the client
5434 * is well-formed.
5435 * Return 0 on success; otherwise this function takes
5436 * responsibility for the passed mdr.
5437 */
5438 int Server::check_layout_vxattr(MDRequestRef& mdr,
5439 string name,
5440 string value,
5441 file_layout_t *layout)
5442 {
5443 const cref_t<MClientRequest> &req = mdr->client_request;
5444 epoch_t epoch;
5445 int r;
5446
5447 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5448 r = parse_layout_vxattr(name, value, osdmap, layout);
5449 epoch = osdmap.get_epoch();
5450 });
5451
5452 if (r == -ENOENT) {
5453
5454 // we don't have the specified pool; make sure our map
5455 // is at least as new as the client's.
5456 epoch_t req_epoch = req->get_osdmap_epoch();
5457
5458 if (req_epoch > epoch) {
5459
5460 // well, our map is older; wait for a newer osdmap.
5461 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5462
5463 if (!mds->objecter->wait_for_map(req_epoch, fin))
5464 return r; // wait, fin will retry this request later
5465
5466 delete fin;
5467
5468 // now we have at least as new a map as the client, try again.
5469 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5470 r = parse_layout_vxattr(name, value, osdmap, layout);
5471 epoch = osdmap.get_epoch();
5472 });
5473
5474 ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
5475
5476 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5477
5478 // For compatibility with clients running old code, we still need to
5479 // get the latest map. One day, once COMPACT_VERSION of MClientRequest
5480 // is >= 3, we can remove this code.
5481 mdr->waited_for_osdmap = true;
5482 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
5483 mds, new C_MDS_RetryRequest(mdcache, mdr)));
5484 return r;
5485 }
5486 }
5487
5488 if (r < 0) {
5489
5490 if (r == -ENOENT)
5491 r = -EINVAL;
5492
5493 respond_to_request(mdr, r);
5494 return r;
5495 }
5496
5497 // all is well
5498 return 0;
5499 }
5500
5501 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5502 {
5503 const cref_t<MClientRequest> &req = mdr->client_request;
5504 string name(req->get_path2());
5505 bufferlist bl = req->get_data();
5506 string value (bl.c_str(), bl.length());
5507 dout(10) << "handle_set_vxattr " << name
5508 << " val " << value.length()
5509 << " bytes on " << *cur
5510 << dendl;
5511
5512 CInode::mempool_inode *pip = nullptr;
5513 string rest;
5514
5515 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5516 return;
5517 }
5518
5519 bool new_realm = false;
5520 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5521 if (!cur->is_dir()) {
5522 respond_to_request(mdr, -EINVAL);
5523 return;
5524 }
5525
5526 if (!xlock_policylock(mdr, cur, true))
5527 return;
5528
5529 file_layout_t layout;
5530 if (cur->get_projected_inode()->has_layout())
5531 layout = cur->get_projected_inode()->layout;
5532 else if (mdr->dir_layout != file_layout_t())
5533 layout = mdr->dir_layout;
5534 else
5535 layout = mdcache->default_file_layout;
5536
5537 rest = name.substr(name.find("layout"));
5538 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5539 return;
5540
5541 auto &pi = cur->project_inode();
5542 pi.inode.layout = layout;
5543 mdr->no_early_reply = true;
5544 pip = &pi.inode;
5545 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5546 if (!cur->is_file()) {
5547 respond_to_request(mdr, -EINVAL);
5548 return;
5549 }
5550 if (cur->get_projected_inode()->size ||
5551 cur->get_projected_inode()->truncate_seq > 1) {
5552 respond_to_request(mdr, -ENOTEMPTY);
5553 return;
5554 }
5555 file_layout_t layout = cur->get_projected_inode()->layout;
5556 rest = name.substr(name.find("layout"));
5557 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5558 return;
5559
5560 MutationImpl::LockOpVec lov;
5561 lov.add_xlock(&cur->filelock);
5562 if (!mds->locker->acquire_locks(mdr, lov))
5563 return;
5564
5565 auto &pi = cur->project_inode();
5566 int64_t old_pool = pi.inode.layout.pool_id;
5567 pi.inode.add_old_pool(old_pool);
5568 pi.inode.layout = layout;
5569 pip = &pi.inode;
5570 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5571 if (!cur->is_dir() || cur->is_root()) {
5572 respond_to_request(mdr, -EINVAL);
5573 return;
5574 }
5575
5576 quota_info_t quota = cur->get_projected_inode()->quota;
5577
5578 rest = name.substr(name.find("quota"));
5579 int r = parse_quota_vxattr(rest, value, &quota);
5580 if (r < 0) {
5581 respond_to_request(mdr, r);
5582 return;
5583 }
5584
5585 if (quota.is_enable() && !cur->get_projected_srnode())
5586 new_realm = true;
5587
5588 if (!xlock_policylock(mdr, cur, false, new_realm))
5589 return;
5590
5591 auto &pi = cur->project_inode(false, new_realm);
5592 pi.inode.quota = quota;
5593
5594 if (new_realm) {
5595 SnapRealm *realm = cur->find_snaprealm();
5596 auto seq = realm->get_newest_seq();
5597 auto &newsnap = *pi.snapnode;
5598 newsnap.created = seq;
5599 newsnap.seq = seq;
5600 }
5601 mdr->no_early_reply = true;
5602 pip = &pi.inode;
5603
5604 client_t exclude_ct = mdr->get_client();
5605 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5606 } else if (name.find("ceph.dir.pin") == 0) {
5607 if (!cur->is_dir() || cur->is_root()) {
5608 respond_to_request(mdr, -EINVAL);
5609 return;
5610 }
5611
5612 mds_rank_t rank;
5613 try {
5614 rank = boost::lexical_cast<mds_rank_t>(value);
5615 if (rank < 0) rank = MDS_RANK_NONE;
5616 } catch (boost::bad_lexical_cast const&) {
5617 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5618 respond_to_request(mdr, -EINVAL);
5619 return;
5620 }
5621
5622 if (!xlock_policylock(mdr, cur))
5623 return;
5624
5625 auto &pi = cur->project_inode();
5626 cur->set_export_pin(rank);
5627 pip = &pi.inode;
5628 } else {
5629 dout(10) << " unknown vxattr " << name << dendl;
5630 respond_to_request(mdr, -EINVAL);
5631 return;
5632 }
5633
5634 pip->change_attr++;
5635 pip->ctime = mdr->get_op_stamp();
5636 if (mdr->get_op_stamp() > pip->rstat.rctime)
5637 pip->rstat.rctime = mdr->get_op_stamp();
5638 pip->version = cur->pre_dirty();
5639 if (cur->is_file())
5640 pip->update_backtrace();
5641
5642 // log + wait
5643 mdr->ls = mdlog->get_current_segment();
5644 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5645 mdlog->start_entry(le);
5646 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5647 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5648 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5649
5650 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5651 false, false, new_realm));
5652 return;
5653 }
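// Summary of the writable vxattrs handled above:
//   ceph.dir.layout[.*] / ceph.file.layout[.*]  - file layouts
//   ceph.quota[.*]                              - directory quotas
//   ceph.dir.pin                                - export pin, e.g.
//       setfattr -n ceph.dir.pin -v 1 mydir    (pin subtree to rank 1;
//       any negative value, normalized to MDS_RANK_NONE, clears the pin)
// anything else is rejected with EINVAL.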
5654
5655 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
5656 {
5657 const cref_t<MClientRequest> &req = mdr->client_request;
5658 string name(req->get_path2());
5659
5660 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5661
5662 if (name == "ceph.dir.layout") {
5663 if (!cur->is_dir()) {
5664 respond_to_request(mdr, -ENODATA);
5665 return;
5666 }
5667 if (cur->is_root()) {
5668 dout(10) << "can't remove layout policy on the root directory" << dendl;
5669 respond_to_request(mdr, -EINVAL);
5670 return;
5671 }
5672
5673 if (!cur->get_projected_inode()->has_layout()) {
5674 respond_to_request(mdr, -ENODATA);
5675 return;
5676 }
5677
5678 MutationImpl::LockOpVec lov;
5679 lov.add_xlock(&cur->policylock);
5680 if (!mds->locker->acquire_locks(mdr, lov))
5681 return;
5682
5683 auto &pi = cur->project_inode();
5684 pi.inode.clear_layout();
5685 pi.inode.version = cur->pre_dirty();
5686
5687 // log + wait
5688 mdr->ls = mdlog->get_current_segment();
5689 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5690 mdlog->start_entry(le);
5691 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5692 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5693 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5694
5695 mdr->no_early_reply = true;
5696 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5697 return;
5698 } else if (name == "ceph.dir.layout.pool_namespace"
5699 || name == "ceph.file.layout.pool_namespace") {
5700 // Namespace is the only layout field that has a meaningful
5701 // null/none value (an empty string means the default layout). Removing
5702 // it is equivalent to a setxattr with an empty string, so pass the
5703 // empty payload of the rmxattr request through to handle_set_vxattr().
5704 handle_set_vxattr(mdr, cur);
5705 return;
5706 }
5707
5708 respond_to_request(mdr, -ENODATA);
5709 }
5710
5711 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
5712 CInode *in;
5713 public:
5714
5715 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
5716 ServerLogContext(s, r), in(i) { }
5717 void finish(int r) override {
5718 ceph_assert(r == 0);
5719
5720 // apply
5721 in->pop_and_dirty_projected_inode(mdr->ls);
5722
5723 mdr->apply();
5724
5725 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5726
5727 server->respond_to_request(mdr, 0);
5728 }
5729 };
5730
5731 void Server::handle_client_setxattr(MDRequestRef& mdr)
5732 {
5733 const cref_t<MClientRequest> &req = mdr->client_request;
5734 string name(req->get_path2());
5735
5736 // magic ceph.* namespace?
5737 if (name.compare(0, 5, "ceph.") == 0) {
5738 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5739 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5740 if (!cur)
5741 return;
5742
5743 handle_set_vxattr(mdr, cur);
5744 return;
5745 }
5746
5747 CInode *cur = rdlock_path_pin_ref(mdr, true);
5748 if (!cur)
5749 return;
5750
5751 if (mdr->snapid != CEPH_NOSNAP) {
5752 respond_to_request(mdr, -EROFS);
5753 return;
5754 }
5755
5756 int flags = req->head.args.setxattr.flags;
5757
5758 MutationImpl::LockOpVec lov;
5759 lov.add_xlock(&cur->xattrlock);
5760 if (!mds->locker->acquire_locks(mdr, lov))
5761 return;
5762
5763 if (!check_access(mdr, cur, MAY_WRITE))
5764 return;
5765
5766 auto pxattrs = cur->get_projected_xattrs();
5767 size_t len = req->get_data().length();
5768 size_t inc = len + name.length();
5769
5770 // check xattrs kv pairs size
5771 size_t cur_xattrs_size = 0;
5772 for (const auto& p : *pxattrs) {
5773 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
5774 continue;
5775 }
5776 cur_xattrs_size += p.first.length() + p.second.length();
5777 }
5778
5779 if (cur_xattrs_size + inc > g_conf()->mds_max_xattr_pairs_size) {
5780 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5781 << cur_xattrs_size << ", inc " << inc << dendl;
5782 respond_to_request(mdr, -ENOSPC);
5783 return;
5784 }
5785
5786 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
5787 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
5788 respond_to_request(mdr, -EEXIST);
5789 return;
5790 }
5791 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
5792 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
5793 respond_to_request(mdr, -ENODATA);
5794 return;
5795 }
5796
5797 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
5798
5799 // project update
5800 auto &pi = cur->project_inode(true);
5801 pi.inode.version = cur->pre_dirty();
5802 pi.inode.ctime = mdr->get_op_stamp();
5803 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5804 pi.inode.rstat.rctime = mdr->get_op_stamp();
5805 pi.inode.change_attr++;
5806 pi.inode.xattr_version++;
5807 auto &px = *pi.xattrs;
5808 if ((flags & CEPH_XATTR_REMOVE)) {
5809 px.erase(mempool::mds_co::string(name));
5810 } else {
5811 bufferptr b = buffer::create(len);
5812 if (len)
5813 req->get_data().begin().copy(len, b.c_str());
5814 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
5815 if (!em.second)
5816 em.first->second = b;
5817 }
5818
5819 // log + wait
5820 mdr->ls = mdlog->get_current_segment();
5821 EUpdate *le = new EUpdate(mdlog, "setxattr");
5822 mdlog->start_entry(le);
5823 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5824 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5825 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5826
5827 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5828 }
5829
5830 void Server::handle_client_removexattr(MDRequestRef& mdr)
5831 {
5832 const cref_t<MClientRequest> &req = mdr->client_request;
5833 std::string name(req->get_path2());
5834
5835 if (name.compare(0, 5, "ceph.") == 0) {
5836 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5837 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5838 if (!cur)
5839 return;
5840
5841 handle_remove_vxattr(mdr, cur);
5842 return;
5843 }
5844
5845 CInode* cur = rdlock_path_pin_ref(mdr, true);
5846 if (!cur)
5847 return;
5848
5849 if (mdr->snapid != CEPH_NOSNAP) {
5850 respond_to_request(mdr, -EROFS);
5851 return;
5852 }
5853
5854 MutationImpl::LockOpVec lov;
5855 lov.add_xlock(&cur->xattrlock);
5856 if (!mds->locker->acquire_locks(mdr, lov))
5857 return;
5858
5859 auto pxattrs = cur->get_projected_xattrs();
5860 if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
5861 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5862 respond_to_request(mdr, -ENODATA);
5863 return;
5864 }
5865
5866 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5867
5868 // project update
5869 auto &pi = cur->project_inode(true);
5870 auto &px = *pi.xattrs;
5871 pi.inode.version = cur->pre_dirty();
5872 pi.inode.ctime = mdr->get_op_stamp();
5873 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5874 pi.inode.rstat.rctime = mdr->get_op_stamp();
5875 pi.inode.change_attr++;
5876 pi.inode.xattr_version++;
5877 px.erase(mempool::mds_co::string(name));
5878
5879 // log + wait
5880 mdr->ls = mdlog->get_current_segment();
5881 EUpdate *le = new EUpdate(mdlog, "removexattr");
5882 mdlog->start_entry(le);
5883 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5884 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5885 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5886
5887 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5888 }
5889
5890
5891 // =================================================================
5892 // DIRECTORY and NAMESPACE OPS
5893
5894
5895 // ------------------------------------------------
5896
5897 // MKNOD
5898
5899 class C_MDS_mknod_finish : public ServerLogContext {
5900 CDentry *dn;
5901 CInode *newi;
5902 public:
5903 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5904 ServerLogContext(s, r), dn(d), newi(ni) {}
5905 void finish(int r) override {
5906 ceph_assert(r == 0);
5907
5908 // link the inode
5909 dn->pop_projected_linkage();
5910
5911 // be a bit hacky with the inode version here.. we decrement it
5912 // just to keep mark_dirty() happy. (we didn't bother projecting
5913 // a new version of the inode since it's just been created)
5914 newi->inode.version--;
5915 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5916 newi->mark_dirty_parent(mdr->ls, true);
5917
5918 // mkdir?
5919 if (newi->inode.is_dir()) {
5920 CDir *dir = newi->get_dirfrag(frag_t());
5921 ceph_assert(dir);
5922 dir->fnode.version--;
5923 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5924 dir->mark_new(mdr->ls);
5925 }
5926
5927 mdr->apply();
5928
5929 MDRequestRef null_ref;
5930 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5931
5932 if (newi->inode.is_file())
5933 get_mds()->locker->share_inode_max_size(newi);
5934
5935 // hit pop
5936 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
5937
5938 // reply
5939 server->respond_to_request(mdr, 0);
5940 }
5941 };
5942
5943
5944 void Server::handle_client_mknod(MDRequestRef& mdr)
5945 {
5946 const cref_t<MClientRequest> &req = mdr->client_request;
5947 client_t client = mdr->get_client();
5948
5949 unsigned mode = req->head.args.mknod.mode;
5950 if ((mode & S_IFMT) == 0)
5951 mode |= S_IFREG;
5952
5953 mdr->disable_lock_cache();
5954 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
5955 if (!dn)
5956 return;
5957
5958 CDir *dir = dn->get_dir();
5959 CInode *diri = dir->get_inode();
5960 if (!check_access(mdr, diri, MAY_WRITE))
5961 return;
5962 if (!check_fragment_space(mdr, dn->get_dir()))
5963 return;
5964
5965 // set layout
5966 file_layout_t layout;
5967 if (mdr->dir_layout != file_layout_t())
5968 layout = mdr->dir_layout;
5969 else
5970 layout = mdcache->default_file_layout;
5971
5972 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
5973 ceph_assert(newi);
5974
5975 dn->push_projected_linkage(newi);
5976
5977 newi->inode.rdev = req->head.args.mknod.rdev;
5978 newi->inode.version = dn->pre_dirty();
5979 newi->inode.rstat.rfiles = 1;
5980 if (layout.pool_id != mdcache->default_file_layout.pool_id)
5981 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
5982 newi->inode.update_backtrace();
5983
5984 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
5985 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
5986 ceph_assert(follows >= realm->get_newest_seq());
5987
5988 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
5989 // want to write to it (e.g., if they are reexporting NFS)
5990 if (S_ISREG(newi->inode.mode)) {
5991 // issue a cap on the file
5992 int cmode = CEPH_FILE_MODE_RDWR;
5993 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
5994 if (cap) {
5995 cap->set_wanted(0);
5996
5997 // put locks in excl mode
5998 newi->filelock.set_state(LOCK_EXCL);
5999 newi->authlock.set_state(LOCK_EXCL);
6000 newi->xattrlock.set_state(LOCK_EXCL);
6001
6002 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6003 newi->inode.client_ranges[client].range.first = 0;
6004 newi->inode.client_ranges[client].range.last = newi->inode.layout.stripe_unit;
6005 newi->inode.client_ranges[client].follows = follows;
6006 cap->mark_clientwriteable();
6007 }
6008 }
6009
6010 ceph_assert(dn->first == follows + 1);
6011 newi->first = dn->first;
6012
6013 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
6014
6015 // prepare finisher
6016 mdr->ls = mdlog->get_current_segment();
6017 EUpdate *le = new EUpdate(mdlog, "mknod");
6018 mdlog->start_entry(le);
6019 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6020 journal_allocated_inos(mdr, &le->metablob);
6021
6022 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6023 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6024 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6025
6026 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6027 mds->balancer->maybe_fragment(dn->get_dir(), false);
6028 }
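// note the client_range set above: a freshly created regular file gets an
// initial max_size grant of one stripe unit, so the creating client can
// write immediately; the Locker grows the range later as the file does.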
6029
6030
6031
6032 // MKDIR
6033 /* This function takes responsibility for the passed mdr */
6034 void Server::handle_client_mkdir(MDRequestRef& mdr)
6035 {
6036 const cref_t<MClientRequest> &req = mdr->client_request;
6037
6038 mdr->disable_lock_cache();
6039 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6040 if (!dn)
6041 return;
6042
6043 CDir *dir = dn->get_dir();
6044 CInode *diri = dir->get_inode();
6045
6046 // mkdir check access
6047 if (!check_access(mdr, diri, MAY_WRITE))
6048 return;
6049
6050 if (!check_fragment_space(mdr, dir))
6051 return;
6052
6053 // new inode
6054 unsigned mode = req->head.args.mkdir.mode;
6055 mode &= ~S_IFMT;
6056 mode |= S_IFDIR;
6057 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6058 ceph_assert(newi);
6059
6060 // it's a directory.
6061 dn->push_projected_linkage(newi);
6062
6063 newi->inode.version = dn->pre_dirty();
6064 newi->inode.rstat.rsubdirs = 1;
6065 newi->inode.update_backtrace();
6066
6067 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6068 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6069 ceph_assert(follows >= realm->get_newest_seq());
6070
6071 dout(12) << " follows " << follows << dendl;
6072 ceph_assert(dn->first == follows + 1);
6073 newi->first = dn->first;
6074
6075 // ...and that new dir is empty.
6076 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6077 newdir->state_set(CDir::STATE_CREATING);
6078 newdir->mark_complete();
6079 newdir->fnode.version = newdir->pre_dirty();
6080
6081 // prepare finisher
6082 mdr->ls = mdlog->get_current_segment();
6083 EUpdate *le = new EUpdate(mdlog, "mkdir");
6084 mdlog->start_entry(le);
6085 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6086 journal_allocated_inos(mdr, &le->metablob);
6087 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6088 le->metablob.add_primary_dentry(dn, newi, true, true);
6089 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6090
6091 // issue a cap on the directory
6092 int cmode = CEPH_FILE_MODE_RDWR;
6093 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6094 if (cap) {
6095 cap->set_wanted(0);
6096
6097 // put locks in excl mode
6098 newi->filelock.set_state(LOCK_EXCL);
6099 newi->authlock.set_state(LOCK_EXCL);
6100 newi->xattrlock.set_state(LOCK_EXCL);
6101 }
6102
6103 // make sure this inode gets into the journal
6104 le->metablob.add_opened_ino(newi->ino());
6105
6106 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6107
6108 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6109 // have overshot the split size (multiple mkdir in flight), so here is
6110 // an early chance to split the dir if this mkdir makes it oversized.
6111 mds->balancer->maybe_fragment(dir, false);
6112 }
6113
6114
6115 // SYMLINK
6116
6117 void Server::handle_client_symlink(MDRequestRef& mdr)
6118 {
6119 mdr->disable_lock_cache();
6120 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6121 if (!dn)
6122 return;
6123
6124 CDir *dir = dn->get_dir();
6125 CInode *diri = dir->get_inode();
6126
6127 if (!check_access(mdr, diri, MAY_WRITE))
6128 return;
6129 if (!check_fragment_space(mdr, dir))
6130 return;
6131
6132 const cref_t<MClientRequest> &req = mdr->client_request;
6133
6134 unsigned mode = S_IFLNK | 0777;
6135 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6136 ceph_assert(newi);
6137
6138 // it's a symlink
6139 dn->push_projected_linkage(newi);
6140
6141 newi->symlink = req->get_path2();
6142 newi->inode.size = newi->symlink.length();
6143 newi->inode.rstat.rbytes = newi->inode.size;
6144 newi->inode.rstat.rfiles = 1;
6145 newi->inode.version = dn->pre_dirty();
6146 newi->inode.update_backtrace();
6147
6148 newi->first = dn->first;
6149
6150 // prepare finisher
6151 mdr->ls = mdlog->get_current_segment();
6152 EUpdate *le = new EUpdate(mdlog, "symlink");
6153 mdlog->start_entry(le);
6154 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6155 journal_allocated_inos(mdr, &le->metablob);
6156 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6157 le->metablob.add_primary_dentry(dn, newi, true, true);
6158
6159 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6160 mds->balancer->maybe_fragment(dir, false);
6161 }
6162
6163
6164
6165
6166
6167 // LINK
6168
6169 void Server::handle_client_link(MDRequestRef& mdr)
6170 {
6171 const cref_t<MClientRequest> &req = mdr->client_request;
6172
6173 dout(7) << "handle_client_link " << req->get_filepath()
6174 << " to " << req->get_filepath2()
6175 << dendl;
6176
6177 mdr->disable_lock_cache();
6178
6179 CDentry *destdn;
6180 CInode *targeti;
6181
6182 if (req->get_filepath2().depth() == 0) {
6183 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6184 if (!targeti) {
6185 dout(10) << "ESTALE on path2, attempting recovery" << dendl;
6186 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6187 return;
6188 }
6189 mdr->pin(targeti);
6190
6191 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6192 CDentry *pdn = targeti->get_projected_parent_dn();
6193 if (!pdn) {
6194 dout(7) << "target has no parent dn, failing..." << dendl;
6195 respond_to_request(mdr, -EINVAL);
6196 return;
6197 }
6198 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6199 return;
6200 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6201 }
6202
6203 destdn = rdlock_path_xlock_dentry(mdr, false);
6204 if (!destdn)
6205 return;
6206
6207 } else {
6208 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6209 destdn = ret.first;
6210 if (!destdn)
6211 return;
6212
6213 if (!destdn->get_projected_linkage()->is_null()) {
6214 respond_to_request(mdr, -EEXIST);
6215 return;
6216 }
6217
6218 targeti = ret.second->get_projected_linkage()->get_inode();
6219 }
6220
6221 if (targeti->is_dir()) {
6222 dout(7) << "target is a dir, failing..." << dendl;
6223 respond_to_request(mdr, -EINVAL);
6224 return;
6225 }
6226
6227 CDir *dir = destdn->get_dir();
6228 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
6229 dout(7) << "target is " << *targeti << dendl;
6230
6231 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6232 MutationImpl::LockOpVec lov;
6233 lov.add_xlock(&targeti->snaplock);
6234 lov.add_xlock(&targeti->linklock);
6235
6236 if (!mds->locker->acquire_locks(mdr, lov))
6237 return;
6238
6239 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6240 }
6241
6242 if (targeti->get_projected_inode()->nlink == 0) {
6243 dout(7) << "target has no link, failing..." << dendl;
6244 respond_to_request(mdr, -ENOENT);
return; // mdr has been responded to; must not fall through to _link_local/_link_remote
6245 }
6246
6247 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6248 if (!check_access(mdr, targeti, MAY_WRITE))
6249 return;
6250
6251 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6252 return;
6253
6254 if (!check_fragment_space(mdr, dir))
6255 return;
6256 }
6257
6258 // go!
6259 ceph_assert(g_conf()->mds_kill_link_at != 1);
6260
6261 // local or remote?
6262 if (targeti->is_auth())
6263 _link_local(mdr, destdn, targeti);
6264 else
6265 _link_remote(mdr, true, destdn, targeti);
6266 mds->balancer->maybe_fragment(dir, false);
6267 }
6268
6269
6270 class C_MDS_link_local_finish : public ServerLogContext {
6271 CDentry *dn;
6272 CInode *targeti;
6273 version_t dnpv;
6274 version_t tipv;
6275 bool adjust_realm;
6276 public:
6277 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
6278 version_t dnpv_, version_t tipv_, bool ar) :
6279 ServerLogContext(s, r), dn(d), targeti(ti),
6280 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
6281 void finish(int r) override {
6282 ceph_assert(r == 0);
6283 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
6284 }
6285 };
6286
6287
6288 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
6289 {
6290 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6291
6292 mdr->ls = mdlog->get_current_segment();
6293
6294 // predirty NEW dentry
6295 version_t dnpv = dn->pre_dirty();
6296 version_t tipv = targeti->pre_dirty();
6297
6298 // project inode update
6299 auto &pi = targeti->project_inode();
6300 pi.inode.nlink++;
6301 pi.inode.ctime = mdr->get_op_stamp();
6302 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6303 pi.inode.rstat.rctime = mdr->get_op_stamp();
6304 pi.inode.change_attr++;
6305 pi.inode.version = tipv;
6306
6307 bool adjust_realm = false;
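// roughly: hard links can cross snaprealm boundaries, so the first extra
// link converts the target's snaprealm to a "global" one anchored to the
// inode itself, with past parents recorded per dentry
// (record_snaprealm_parent_dentry) so snapshots above either link still
// resolve.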
6308 if (!targeti->is_projected_snaprealm_global()) {
6309 sr_t *newsnap = targeti->project_snaprealm();
6310 targeti->mark_snaprealm_global(newsnap);
6311 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6312 adjust_realm = true;
6313 }
6314
6315 // log + wait
6316 EUpdate *le = new EUpdate(mdlog, "link_local");
6317 mdlog->start_entry(le);
6318 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6319 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6320 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6321 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6322 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6323
6324 // do this after predirty_*, to avoid funky extra dnl arg
6325 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6326
6327 journal_and_reply(mdr, targeti, dn, le,
6328 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6329 }
6330
6331 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6332 version_t dnpv, version_t tipv, bool adjust_realm)
6333 {
6334 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6335
6336 // link and unlock the NEW dentry
6337 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6338 if (!dnl->get_inode())
6339 dn->link_remote(dnl, targeti);
6340 dn->mark_dirty(dnpv, mdr->ls);
6341
6342 // target inode
6343 targeti->pop_and_dirty_projected_inode(mdr->ls);
6344
6345 mdr->apply();
6346
6347 MDRequestRef null_ref;
6348 mdcache->send_dentry_link(dn, null_ref);
6349
6350 if (adjust_realm) {
6351 int op = CEPH_SNAP_OP_SPLIT;
6352 mds->mdcache->send_snap_update(targeti, 0, op);
6353 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6354 }
6355
6356 // bump target popularity
6357 mds->balancer->hit_inode(targeti, META_POP_IWR);
6358 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6359
6360 // reply
6361 respond_to_request(mdr, 0);
6362 }
6363
6364
6365 // link / unlink remote
6366
6367 class C_MDS_link_remote_finish : public ServerLogContext {
6368 bool inc;
6369 CDentry *dn;
6370 CInode *targeti;
6371 version_t dpv;
6372 public:
6373 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6374 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6375 dpv(d->get_projected_version()) {}
6376 void finish(int r) override {
6377 ceph_assert(r == 0);
6378 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6379 }
6380 };
6381
6382 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6383 {
6384 dout(10) << "_link_remote "
6385 << (inc ? "link ":"unlink ")
6386 << *dn << " to " << *targeti << dendl;
6387
6388 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6389 mds_rank_t linkauth = targeti->authority().first;
6390 if (mdr->more()->witnessed.count(linkauth) == 0) {
6391 if (mds->is_cluster_degraded() &&
6392 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6393 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6394 if (mdr->more()->waiting_on_slave.empty())
6395 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6396 return;
6397 }
6398
6399 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6400 int op;
6401 if (inc)
6402 op = MMDSSlaveRequest::OP_LINKPREP;
6403 else
6404 op = MMDSSlaveRequest::OP_UNLINKPREP;
6405 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, op);
6406 targeti->set_object_info(req->get_object_info());
6407 req->op_stamp = mdr->get_op_stamp();
6408 if (auto& desti_srnode = mdr->more()->desti_srnode)
6409 encode(*desti_srnode, req->desti_snapbl);
6410 mds->send_message_mds(req, linkauth);
6411
6412 ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
6413 mdr->more()->waiting_on_slave.insert(linkauth);
6414 return;
6415 }
6416 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6417
6418 ceph_assert(g_conf()->mds_kill_link_at != 2);
6419
6420 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6421 delete desti_srnode;
6422 desti_srnode = NULL;
6423 }
6424
6425 mdr->set_mds_stamp(ceph_clock_now());
6426
6427 // add to event
6428 mdr->ls = mdlog->get_current_segment();
6429 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6430 mdlog->start_entry(le);
6431 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6432 if (!mdr->more()->witnessed.empty()) {
6433 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6434 le->reqid = mdr->reqid;
6435 le->had_slaves = true;
6436 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6437 }
6438
6439 if (inc) {
6440 dn->pre_dirty();
6441 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6442 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6443 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6444 } else {
6445 dn->pre_dirty();
6446 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6447 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6448 le->metablob.add_null_dentry(dn, true);
6449 dn->push_projected_linkage();
6450 }
6451
6452 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
6453 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6454 }
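// _link_remote is the master half of a two-phase update: the target
// inode's auth MDS must journal the nlink change first (the
// OP_LINKPREP/OP_UNLINKPREP round-trip, tracked via witnessed and
// waiting_on_slave) before we journal our own dentry change and reply.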
6455
6456 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6457 CDentry *dn, CInode *targeti,
6458 version_t dpv)
6459 {
6460 dout(10) << "_link_remote_finish "
6461 << (inc ? "link ":"unlink ")
6462 << *dn << " to " << *targeti << dendl;
6463
6464 ceph_assert(g_conf()->mds_kill_link_at != 3);
6465
6466 if (!mdr->more()->witnessed.empty())
6467 mdcache->logged_master_update(mdr->reqid);
6468
6469 if (inc) {
6470 // link the new dentry
6471 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6472 if (!dnl->get_inode())
6473 dn->link_remote(dnl, targeti);
6474 dn->mark_dirty(dpv, mdr->ls);
6475 } else {
6476 // unlink main dentry
6477 dn->get_dir()->unlink_inode(dn);
6478 dn->pop_projected_linkage();
6479 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6480 }
6481
6482 mdr->apply();
6483
6484 MDRequestRef null_ref;
6485 if (inc)
6486 mdcache->send_dentry_link(dn, null_ref);
6487 else
6488 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6489
6490 // bump target popularity
6491 mds->balancer->hit_inode(targeti, META_POP_IWR);
6492 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6493
6494 // reply
6495 respond_to_request(mdr, 0);
6496
6497 if (!inc)
6498 // removing a new dn?
6499 dn->get_dir()->try_remove_unlinked_dn(dn);
6500 }
6501
6502
6503 // remote linking/unlinking
6504
6505 class C_MDS_SlaveLinkPrep : public ServerLogContext {
6506 CInode *targeti;
6507 bool adjust_realm;
6508 public:
6509 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6510 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6511 void finish(int r) override {
6512 ceph_assert(r == 0);
6513 server->_logged_slave_link(mdr, targeti, adjust_realm);
6514 }
6515 };
6516
6517 class C_MDS_SlaveLinkCommit : public ServerContext {
6518 MDRequestRef mdr;
6519 CInode *targeti;
6520 public:
6521 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6522 ServerContext(s), mdr(r), targeti(t) { }
6523 void finish(int r) override {
6524 server->_commit_slave_link(mdr, r, targeti);
6525 }
6526 };
6527
6528 void Server::handle_slave_link_prep(MDRequestRef& mdr)
6529 {
6530 dout(10) << "handle_slave_link_prep " << *mdr
6531 << " on " << mdr->slave_request->get_object_info()
6532 << dendl;
6533
6534 ceph_assert(g_conf()->mds_kill_link_at != 4);
6535
6536 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
6537 ceph_assert(targeti);
6538 dout(10) << "targeti " << *targeti << dendl;
6539 CDentry *dn = targeti->get_parent_dn();
6540 CDentry::linkage_t *dnl = dn->get_linkage();
6541 ceph_assert(dnl->is_primary());
6542
6543 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6544
6545 mdr->auth_pin(targeti);
6546
6547 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6548 ceph_assert(g_conf()->mds_kill_link_at != 5);
6549
6550 // journal it
6551 mdr->ls = mdlog->get_current_segment();
6552 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
6553 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
6554 mdlog->start_entry(le);
6555
6556 auto &pi = dnl->get_inode()->project_inode();
6557
6558 // update journaled target inode
6559 bool inc;
6560 bool adjust_realm = false;
6561 bool realm_projected = false;
6562 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
6563 inc = true;
6564 pi.inode.nlink++;
6565 if (!targeti->is_projected_snaprealm_global()) {
6566 sr_t *newsnap = targeti->project_snaprealm();
6567 targeti->mark_snaprealm_global(newsnap);
6568 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6569 adjust_realm = true;
6570 realm_projected = true;
6571 }
6572 } else {
6573 inc = false;
6574 pi.inode.nlink--;
6575 if (targeti->is_projected_snaprealm_global()) {
6576 ceph_assert(mdr->slave_request->desti_snapbl.length());
6577 auto p = mdr->slave_request->desti_snapbl.cbegin();
6578
6579 sr_t *newsnap = targeti->project_snaprealm();
6580 decode(*newsnap, p);
6581
6582 if (pi.inode.nlink == 0)
6583 ceph_assert(!newsnap->is_parent_global());
6584
6585 realm_projected = true;
6586 } else {
6587 ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
6588 }
6589 }
6590
6591 link_rollback rollback;
6592 rollback.reqid = mdr->reqid;
6593 rollback.ino = targeti->ino();
6594 rollback.old_ctime = targeti->inode.ctime; // we hold versionlock xlock; no concurrent projections
6595 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
6596 rollback.old_dir_mtime = pf->fragstat.mtime;
6597 rollback.old_dir_rctime = pf->rstat.rctime;
6598 rollback.was_inc = inc;
6599 if (realm_projected) {
6600 if (targeti->snaprealm) {
6601 encode(true, rollback.snapbl);
6602 targeti->encode_snap_blob(rollback.snapbl);
6603 } else {
6604 encode(false, rollback.snapbl);
6605 }
6606 }
6607 encode(rollback, le->rollback);
6608 mdr->more()->rollback_bl = le->rollback;
6609
6610 pi.inode.ctime = mdr->get_op_stamp();
6611 pi.inode.version = targeti->pre_dirty();
6612
6613 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
6614
6615 // commit case
6616 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6617 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
6618 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
6619
6620 // set up commit waiter
6621 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
6622
6623 mdr->more()->slave_update_journaled = true;
6624 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
6625 mdr, __func__);
6626 mdlog->flush();
6627 }
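// A note on the rollback blob journaled above (illustrative sketch, not
// part of the request flow): do_link_rollback() below must decode it in
// exactly the field order link_rollback's encode() wrote it, e.g.
//
//   link_rollback rb;
//   auto it = mdr->more()->rollback_bl.cbegin();
//   decode(rb, it);   // rb.old_ctime etc. carry the pre-link values
//
// so any field added to link_rollback has to be appended on both sides.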
6628
6629 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
6630 {
6631 dout(10) << "_logged_slave_link " << *mdr
6632 << " " << *targeti << dendl;
6633
6634 ceph_assert(g_conf()->mds_kill_link_at != 6);
6635
6636 // update the target
6637 targeti->pop_and_dirty_projected_inode(mdr->ls);
6638 mdr->apply();
6639
6640 // hit pop
6641 mds->balancer->hit_inode(targeti, META_POP_IWR);
6642
6643 // done.
6644 mdr->reset_slave_request();
6645
6646 if (adjust_realm) {
6647 int op = CEPH_SNAP_OP_SPLIT;
6648 mds->mdcache->send_snap_update(targeti, 0, op);
6649 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6650 }
6651
6652 // ack
6653 if (!mdr->aborted) {
6654 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
6655 mds->send_message_mds(reply, mdr->slave_to_mds);
6656 } else {
6657 dout(10) << " abort flag set, finishing" << dendl;
6658 mdcache->request_finish(mdr);
6659 }
6660 }
6661
6662
6663 struct C_MDS_CommittedSlave : public ServerLogContext {
6664 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
6665 void finish(int r) override {
6666 server->_committed_slave(mdr);
6667 }
6668 };
6669
6670 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
6671 {
6672 dout(10) << "_commit_slave_link " << *mdr
6673 << " r=" << r
6674 << " " << *targeti << dendl;
6675
6676 ceph_assert(g_conf()->mds_kill_link_at != 7);
6677
6678 if (r == 0) {
6679 // drop our pins, etc.
6680 mdr->cleanup();
6681
6682 // write a commit to the journal
6683 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
6684 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
6685 mdlog->start_entry(le);
6686 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6687 mdlog->flush();
6688 } else {
6689 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6690 }
6691 }
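// The prep above and the commit/rollback here form the slave side of a
// two-phase update: OP_PREPARE was journaled in handle_slave_link_prep,
// and the master's decision arrives as r (0 => journal OP_COMMIT via
// C_MDS_CommittedSlave; nonzero => replay the saved rollback blob).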
6692
6693 void Server::_committed_slave(MDRequestRef& mdr)
6694 {
6695 dout(10) << "_committed_slave " << *mdr << dendl;
6696
6697 ceph_assert(g_conf()->mds_kill_link_at != 8);
6698
6699 bool assert_exist = mdr->more()->slave_update_journaled;
6700 mdcache->finish_uncommitted_slave(mdr->reqid, assert_exist);
6701 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
6702 mds->send_message_mds(req, mdr->slave_to_mds);
6703 mdcache->request_finish(mdr);
6704 }
6705
6706 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
6707 MutationRef mut;
6708 map<client_t,ref_t<MClientSnap>> splits;
6709 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
6710 map<client_t,ref_t<MClientSnap>>&& _splits) :
6711 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
6712 }
6713 void finish(int r) override {
6714 server->_link_rollback_finish(mut, mdr, splits);
6715 }
6716 };
6717
6718 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6719 {
6720 link_rollback rollback;
6721 auto p = rbl.cbegin();
6722 decode(rollback, p);
6723
6724 dout(10) << "do_link_rollback on " << rollback.reqid
6725 << (rollback.was_inc ? " inc":" dec")
6726 << " ino " << rollback.ino
6727 << dendl;
6728
6729 ceph_assert(g_conf()->mds_kill_link_at != 9);
6730
6731 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6732 ceph_assert(mdr || mds->is_resolve());
6733
6734 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
6735 mut->ls = mds->mdlog->get_current_segment();
6736
6737 CInode *in = mdcache->get_inode(rollback.ino);
6738 ceph_assert(in);
6739 dout(10) << " target is " << *in << dendl;
6740 ceph_assert(!in->is_projected()); // live slave request holds versionlock xlock.
6741
6742 auto &pi = in->project_inode();
6743 pi.inode.version = in->pre_dirty();
6744 mut->add_projected_inode(in);
6745
6746 // parent dir rctime
6747 CDir *parent = in->get_projected_parent_dn()->get_dir();
6748 fnode_t *pf = parent->project_fnode();
6749 mut->add_projected_fnode(parent);
6750 pf->version = parent->pre_dirty();
6751 if (pf->fragstat.mtime == pi.inode.ctime) {
6752 pf->fragstat.mtime = rollback.old_dir_mtime;
6753 if (pf->rstat.rctime == pi.inode.ctime)
6754 pf->rstat.rctime = rollback.old_dir_rctime;
6755 mut->add_updated_lock(&parent->get_inode()->filelock);
6756 mut->add_updated_lock(&parent->get_inode()->nestlock);
6757 }
6758
6759 // inode
6760 pi.inode.ctime = rollback.old_ctime;
6761 if (rollback.was_inc)
6762 pi.inode.nlink--;
6763 else
6764 pi.inode.nlink++;
6765
6766 map<client_t,ref_t<MClientSnap>> splits;
6767 if (rollback.snapbl.length() && in->snaprealm) {
6768 bool hadrealm;
6769 auto p = rollback.snapbl.cbegin();
6770 decode(hadrealm, p);
6771 if (hadrealm) {
6772 if (!mds->is_resolve()) {
6773 sr_t *new_srnode = new sr_t();
6774 decode(*new_srnode, p);
6775 in->project_snaprealm(new_srnode);
6776 } else {
6777 decode(in->snaprealm->srnode, p);
6778 }
6779 } else {
6780 SnapRealm *realm = parent->get_inode()->find_snaprealm();
6781 if (!mds->is_resolve())
6782 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
6783 in->project_snaprealm(NULL);
6784 }
6785 }
6786
6787 // journal it
6788 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
6789 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
6790 mdlog->start_entry(le);
6791 le->commit.add_dir_context(parent);
6792 le->commit.add_dir(parent, true);
6793 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
6794
6795 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
6796 mdr, __func__);
6797 mdlog->flush();
6798 }
6799
6800 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
6801 map<client_t,ref_t<MClientSnap>>& splits)
6802 {
6803 dout(10) << "_link_rollback_finish" << dendl;
6804
6805 ceph_assert(g_conf()->mds_kill_link_at != 10);
6806
6807 mut->apply();
6808
6809 if (!mds->is_resolve())
6810 mdcache->send_snaps(splits);
6811
6812 if (mdr)
6813 mdcache->request_finish(mdr);
6814
6815 mdcache->finish_rollback(mut->reqid, mdr);
6816
6817 mut->cleanup();
6818 }
6819
6820
6821 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &m)
6822 {
6823 dout(10) << "handle_slave_link_prep_ack " << *mdr
6824 << " " << *m << dendl;
6825 mds_rank_t from = mds_rank_t(m->get_source().num());
6826
6827 ceph_assert(g_conf()->mds_kill_link_at != 11);
6828
6829 // note slave
6830 mdr->more()->slaves.insert(from);
6831
6832 // witnessed!
6833 ceph_assert(mdr->more()->witnessed.count(from) == 0);
6834 mdr->more()->witnessed.insert(from);
6835 ceph_assert(!m->is_not_journaled());
6836 mdr->more()->has_journaled_slaves = true;
6837
6838 // remove from waiting list
6839 ceph_assert(mdr->more()->waiting_on_slave.count(from));
6840 mdr->more()->waiting_on_slave.erase(from);
6841
6842 ceph_assert(mdr->more()->waiting_on_slave.empty());
6843
6844 dispatch_client_request(mdr); // go again!
6845 }
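// For link/unlink there is exactly one witness (the target inode's auth),
// which is why waiting_on_slave is asserted empty above before dispatching
// again; compare handle_slave_rmdir_prep_ack below, which re-dispatches
// only once the whole witness set has drained.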
6846
6847
6848
6849
6850
6851 // UNLINK
6852
6853 void Server::handle_client_unlink(MDRequestRef& mdr)
6854 {
6855 const cref_t<MClientRequest> &req = mdr->client_request;
6856 client_t client = mdr->get_client();
6857
6858 // rmdir or unlink?
6859 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
6860
6861 if (rmdir)
6862 mdr->disable_lock_cache();
6863 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
6864 if (!dn)
6865 return;
6866
6867 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
6868 ceph_assert(!dnl->is_null());
6869 CInode *in = dnl->get_inode();
6870
6871 if (rmdir) {
6872 dout(7) << "handle_client_rmdir on " << *dn << dendl;
6873 } else {
6874 dout(7) << "handle_client_unlink on " << *dn << dendl;
6875 }
6876 dout(7) << "dn links to " << *in << dendl;
6877
6878 // rmdir vs is_dir
6879 if (in->is_dir()) {
6880 if (rmdir) {
6881 // do empty directory checks
6882 if (_dir_is_nonempty_unlocked(mdr, in)) {
6883 respond_to_request(mdr, -ENOTEMPTY);
6884 return;
6885 }
6886 } else {
6887 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
6888 respond_to_request(mdr, -EISDIR);
6889 return;
6890 }
6891 } else {
6892 if (rmdir) {
6893 // unlink
6894 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
6895 respond_to_request(mdr, -ENOTDIR);
6896 return;
6897 }
6898 }
6899
6900 CInode *diri = dn->get_dir()->get_inode();
6901 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6902 if (!check_access(mdr, diri, MAY_WRITE))
6903 return;
6904 }
6905
6906 // -- create stray dentry? --
6907 CDentry *straydn = NULL;
6908 if (dnl->is_primary()) {
6909 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
6910 if (!straydn)
6911 return;
6912 dout(10) << " straydn is " << *straydn << dendl;
6913 } else if (mdr->straydn) {
6914 mdr->unpin(mdr->straydn);
6915 mdr->straydn = NULL;
6916 }
6917
6918 // lock
6919 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6920 MutationImpl::LockOpVec lov;
6921
6922 lov.add_xlock(&in->linklock);
6923 lov.add_xlock(&in->snaplock);
6924 if (in->is_dir())
6925 lov.add_rdlock(&in->filelock); // to verify it's empty
6926
6927 if (straydn) {
6928 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
6929 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
6930 lov.add_xlock(&straydn->lock);
6931 }
6932
6933 if (!mds->locker->acquire_locks(mdr, lov))
6934 return;
6935
6936 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6937 }
6938
6939 if (in->is_dir() &&
6940 _dir_is_nonempty(mdr, in)) {
6941 respond_to_request(mdr, -ENOTEMPTY);
6942 return;
6943 }
6944
6945 if (straydn)
6946 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
6947
6948 if (!mdr->more()->desti_srnode) {
6949 if (in->is_projected_snaprealm_global()) {
6950 sr_t *new_srnode = in->prepare_new_srnode(0);
6951 in->record_snaprealm_parent_dentry(new_srnode, NULL, dn, dnl->is_primary());
6952 // dropping the last linkage or dropping the last remote linkage,
6953 // detach the inode from the global snaprealm
6954 auto nlink = in->get_projected_inode()->nlink;
6955 if (nlink == 1 ||
6956 (nlink == 2 && !dnl->is_primary() &&
6957 !in->get_projected_parent_dir()->inode->is_stray()))
6958 in->clear_snaprealm_global(new_srnode);
6959 mdr->more()->desti_srnode = new_srnode;
6960 } else if (dnl->is_primary()) {
6961 // prepare snaprealm blob for slave request
6962 SnapRealm *realm = in->find_snaprealm();
6963 snapid_t follows = realm->get_newest_seq();
6964 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
6965 sr_t *new_srnode = in->prepare_new_srnode(follows);
6966 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
6967 mdr->more()->desti_srnode = new_srnode;
6968 }
6969 }
6970 }
6971
6972 // yay!
6973 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
6974 // subtree root auths need to be witnesses
6975 set<mds_rank_t> witnesses;
6976 in->list_replicas(witnesses);
6977 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
6978
6979 for (set<mds_rank_t>::iterator p = witnesses.begin();
6980 p != witnesses.end();
6981 ++p) {
6982 if (mdr->more()->witnessed.count(*p)) {
6983 dout(10) << " already witnessed by mds." << *p << dendl;
6984 } else if (mdr->more()->waiting_on_slave.count(*p)) {
6985 dout(10) << " already waiting on witness mds." << *p << dendl;
6986 } else {
6987 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
6988 return;
6989 }
6990 }
6991 if (!mdr->more()->waiting_on_slave.empty())
6992 return; // we're waiting for a witness.
6993 }
6994
6995 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
6996 mds->locker->create_lock_cache(mdr, diri);
6997
6998 // ok!
6999 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7000 _link_remote(mdr, false, dn, dnl->get_inode());
7001 else
7002 _unlink_local(mdr, dn, straydn);
7003 }
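// Dispatch summary for the tail of handle_client_unlink: a remote dentry
// whose inode is authoritative on another rank goes through
// _link_remote(mdr, false, ...) so the inode auth journals the nlink
// change; every other case (primary link, or remote link to a local
// inode) is handled entirely by _unlink_local().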
7004
7005 class C_MDS_unlink_local_finish : public ServerLogContext {
7006 CDentry *dn;
7007 CDentry *straydn;
7008 version_t dnpv; // deleted dentry
7009 public:
7010 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7011 ServerLogContext(s, r), dn(d), straydn(sd),
7012 dnpv(d->get_projected_version()) {}
7013 void finish(int r) override {
7014 ceph_assert(r == 0);
7015 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7016 }
7017 };
7018
7019 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7020 {
7021 dout(10) << "_unlink_local " << *dn << dendl;
7022
7023 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7024 CInode *in = dnl->get_inode();
7025
7026
7027 // ok, let's do it.
7028 mdr->ls = mdlog->get_current_segment();
7029
7030 // prepare log entry
7031 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7032 mdlog->start_entry(le);
7033 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7034 if (!mdr->more()->witnessed.empty()) {
7035 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7036 le->reqid = mdr->reqid;
7037 le->had_slaves = true;
7038 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7039 }
7040
7041 if (straydn) {
7042 ceph_assert(dnl->is_primary());
7043 straydn->push_projected_linkage(in);
7044 }
7045
7046 // the unlinked dentry
7047 dn->pre_dirty();
7048
7049 auto &pi = in->project_inode();
7050 {
7051 std::string t;
7052 dn->make_path_string(t, true);
7053 pi.inode.stray_prior_path = std::move(t);
7054 }
7055 pi.inode.version = in->pre_dirty();
7056 pi.inode.ctime = mdr->get_op_stamp();
7057 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
7058 pi.inode.rstat.rctime = mdr->get_op_stamp();
7059 pi.inode.change_attr++;
7060 pi.inode.nlink--;
7061 if (pi.inode.nlink == 0)
7062 in->state_set(CInode::STATE_ORPHAN);
7063
7064 if (mdr->more()->desti_srnode) {
7065 auto& desti_srnode = mdr->more()->desti_srnode;
7066 in->project_snaprealm(desti_srnode);
7067 desti_srnode = NULL;
7068 }
7069
7070 if (straydn) {
7071 // will manually pop projected inode
7072
7073 // primary link. add stray dentry.
7074 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7075 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7076
7077 pi.inode.update_backtrace();
7078 le->metablob.add_primary_dentry(straydn, in, true, true);
7079 } else {
7080 mdr->add_projected_inode(in);
7081 // remote link. update remote inode.
7082 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7083 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7084 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7085 }
7086
7087 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7088 le->metablob.add_null_dentry(dn, true);
7089
7090 if (in->is_dir()) {
7091 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7092 le->metablob.renamed_dirino = in->ino();
7093 }
7094
7095 dn->push_projected_linkage();
7096
7097 if (straydn) {
7098 ceph_assert(in->first <= straydn->first);
7099 in->first = straydn->first;
7100 }
7101
7102 if (in->is_dir()) {
7103 ceph_assert(straydn);
7104 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7105 }
7106
7107 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7108 }
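// _unlink_local stages everything as projections: dn pushed a null
// linkage, straydn (if any) a primary one, and the inode a projected
// ctime/nlink. Nothing becomes visible until the EUpdate is safely
// journaled and _unlink_local_finish below pops the projections.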
7109
7110 void Server::_unlink_local_finish(MDRequestRef& mdr,
7111 CDentry *dn, CDentry *straydn,
7112 version_t dnpv)
7113 {
7114 dout(10) << "_unlink_local_finish " << *dn << dendl;
7115
7116 if (!mdr->more()->witnessed.empty())
7117 mdcache->logged_master_update(mdr->reqid);
7118
7119 CInode *strayin = NULL;
7120 bool hadrealm = false;
7121 if (straydn) {
7122 // if there is a newly created snaprealm, we need to split the old snaprealm's
7123 // inodes_with_caps, so pop the snaprealm before the linkage changes.
7124 strayin = dn->get_linkage()->get_inode();
7125 hadrealm = strayin->snaprealm ? true : false;
7126 strayin->early_pop_projected_snaprealm();
7127 }
7128
7129 // unlink main dentry
7130 dn->get_dir()->unlink_inode(dn);
7131 dn->pop_projected_linkage();
7132
7133 // relink as stray? (i.e. was primary link?)
7134 if (straydn) {
7135 dout(20) << " straydn is " << *straydn << dendl;
7136 straydn->pop_projected_linkage();
7137
7138 strayin->pop_and_dirty_projected_inode(mdr->ls);
7139
7140 mdcache->touch_dentry_bottom(straydn);
7141 }
7142
7143 dn->mark_dirty(dnpv, mdr->ls);
7144 mdr->apply();
7145
7146 mdcache->send_dentry_unlink(dn, straydn, mdr);
7147
7148 if (straydn) {
7149 // update subtree map?
7150 if (strayin->is_dir())
7151 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7152
7153 if (strayin->snaprealm && !hadrealm)
7154 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7155 }
7156
7157 // bump pop
7158 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7159
7160 // reply
7161 respond_to_request(mdr, 0);
7162
7163 // removing a new dn?
7164 dn->get_dir()->try_remove_unlinked_dn(dn);
7165
7166 // clean up?
7167 // respond_to_request() drops locks. So stray reintegration can race with us.
7168 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7169 // Tip off the MDCache that this dentry is a stray that
7170 // might be eligible for purge.
7171 mdcache->notify_stray(straydn);
7172 }
7173 }
7174
7175 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7176 {
7177 if (mds->is_cluster_degraded() &&
7178 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7179 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
7180 if (mdr->more()->waiting_on_slave.empty())
7181 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7182 return false;
7183 }
7184
7185 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
7186 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
7187 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7188 for (auto dn : trace)
7189 req->srcdnpath.push_dentry(dn->get_name());
7190 mdcache->encode_replica_stray(straydn, who, req->straybl);
7191 if (mdr->more()->desti_srnode)
7192 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7193
7194 req->op_stamp = mdr->get_op_stamp();
7195 mds->send_message_mds(req, who);
7196
7197 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
7198 mdr->more()->waiting_on_slave.insert(who);
7199 return true;
7200 }
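// The witness cannot rely on its own cache for the stray:
// encode_replica_stray() ships a replica of our stray dentry along with
// the request, so the witness can relink the unlinked directory inode
// under that stray in handle_slave_rmdir_prep.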
7201
7202 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
7203 CDentry *dn, *straydn;
7204 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7205 : ServerLogContext(s, r), dn(d), straydn(st) {}
7206 void finish(int r) override {
7207 server->_logged_slave_rmdir(mdr, dn, straydn);
7208 }
7209 };
7210
7211 struct C_MDS_SlaveRmdirCommit : public ServerContext {
7212 MDRequestRef mdr;
7213 CDentry *straydn;
7214 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
7215 : ServerContext(s), mdr(r), straydn(sd) { }
7216 void finish(int r) override {
7217 server->_commit_slave_rmdir(mdr, r, straydn);
7218 }
7219 };
7220
7221 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
7222 {
7223 dout(10) << "handle_slave_rmdir_prep " << *mdr
7224 << " " << mdr->slave_request->srcdnpath
7225 << " to " << mdr->slave_request->destdnpath
7226 << dendl;
7227
7228 vector<CDentry*> trace;
7229 filepath srcpath(mdr->slave_request->srcdnpath);
7230 dout(10) << " src " << srcpath << dendl;
7231 CInode *in;
7232 CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
7233 int r = mdcache->path_traverse(mdr, cf, srcpath,
7234 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7235 &trace, &in);
7236 if (r > 0) return;
7237 if (r == -ESTALE) {
7238 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7239 mdr->slave_to_mds, true);
7240 return;
7241 }
7242 ceph_assert(r == 0);
7243 CDentry *dn = trace.back();
7244 dout(10) << " dn " << *dn << dendl;
7245 mdr->pin(dn);
7246
7247 ceph_assert(mdr->straydn);
7248 CDentry *straydn = mdr->straydn;
7249 dout(10) << " straydn " << *straydn << dendl;
7250
7251 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7252
7253 rmdir_rollback rollback;
7254 rollback.reqid = mdr->reqid;
7255 rollback.src_dir = dn->get_dir()->dirfrag();
7256 rollback.src_dname = dn->get_name();
7257 rollback.dest_dir = straydn->get_dir()->dirfrag();
7258 rollback.dest_dname = straydn->get_name();
7259 if (mdr->slave_request->desti_snapbl.length()) {
7260 if (in->snaprealm) {
7261 encode(true, rollback.snapbl);
7262 in->encode_snap_blob(rollback.snapbl);
7263 } else {
7264 encode(false, rollback.snapbl);
7265 }
7266 }
7267 encode(rollback, mdr->more()->rollback_bl);
7268 // FIXME: rollback snaprealm
7269 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7270
7271 // set up commit waiter
7272 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
7273
7274 straydn->push_projected_linkage(in);
7275 dn->push_projected_linkage();
7276
7277 ceph_assert(straydn->first >= in->first);
7278 in->first = straydn->first;
7279
7280 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7281 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7282 _logged_slave_rmdir(mdr, dn, straydn);
7283 return;
7284 }
7285
7286 mdr->ls = mdlog->get_current_segment();
7287 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
7288 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
7289 mdlog->start_entry(le);
7290 le->rollback = mdr->more()->rollback_bl;
7291
7292 le->commit.add_dir_context(straydn->get_dir());
7293 le->commit.add_primary_dentry(straydn, in, true);
7294 // slave: no need to journal original dentry
7295
7296 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7297 le->commit.renamed_dirino = in->ino();
7298
7299 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7300 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
7301
7302 mdr->more()->slave_update_journaled = true;
7303 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
7304 mdr, __func__);
7305 mdlog->flush();
7306 }
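// Note the fast path above: if this rank is not auth for any subtree
// root inside the dir being removed, journaling the prep buys nothing,
// so we jump straight to _logged_slave_rmdir with slave_update_journaled
// left false; the ack is then marked not-journaled so the master knows
// no commit record is needed on this rank.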
7307
7308 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7309 {
7310 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
7311 CInode *in = dn->get_linkage()->get_inode();
7312
7313 bool new_realm;
7314 if (mdr->slave_request->desti_snapbl.length()) {
7315 new_realm = !in->snaprealm;
7316 in->decode_snap_blob(mdr->slave_request->desti_snapbl);
7317 ceph_assert(in->snaprealm);
7318 ceph_assert(in->snaprealm->have_past_parents_open());
7319 } else {
7320 new_realm = false;
7321 }
7322
7323 // update our cache now, so we are consistent with what is in the journal
7324 // when we journal a subtree map
7325 dn->get_dir()->unlink_inode(dn);
7326 straydn->pop_projected_linkage();
7327 dn->pop_projected_linkage();
7328
7329 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
7330
7331 if (new_realm)
7332 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7333
7334 // done.
7335 mdr->reset_slave_request();
7336 mdr->straydn = 0;
7337
7338 if (!mdr->aborted) {
7339 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
7340 if (!mdr->more()->slave_update_journaled)
7341 reply->mark_not_journaled();
7342 mds->send_message_mds(reply, mdr->slave_to_mds);
7343 } else {
7344 dout(10) << " abort flag set, finishing" << dendl;
7345 mdcache->request_finish(mdr);
7346 }
7347 }
7348
7349 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
7350 {
7351 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7352 << " " << *ack << dendl;
7353
7354 mds_rank_t from = mds_rank_t(ack->get_source().num());
7355
7356 mdr->more()->slaves.insert(from);
7357 mdr->more()->witnessed.insert(from);
7358 if (!ack->is_not_journaled())
7359 mdr->more()->has_journaled_slaves = true;
7360
7361 // remove from waiting list
7362 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7363 mdr->more()->waiting_on_slave.erase(from);
7364
7365 if (mdr->more()->waiting_on_slave.empty())
7366 dispatch_client_request(mdr); // go again!
7367 else
7368 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
7369 }
7370
7371 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7372 {
7373 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
7374
7375 if (r == 0) {
7376 if (mdr->more()->slave_update_journaled) {
7377 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7378 if (strayin && !strayin->snaprealm)
7379 mdcache->clear_dirty_bits_for_stray(strayin);
7380 }
7381
7382 mdr->cleanup();
7383
7384 if (mdr->more()->slave_update_journaled) {
7385 // write a commit to the journal
7386 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
7387 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7388 ESlaveUpdate::RMDIR);
7389 mdlog->start_entry(le);
7390 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7391 mdlog->flush();
7392 } else {
7393 _committed_slave(mdr);
7394 }
7395 } else {
7396 // abort
7397 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
7398 }
7399 }
7400
7401 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7402 metareqid_t reqid;
7403 CDentry *dn;
7404 CDentry *straydn;
7405 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7406 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7407 void finish(int r) override {
7408 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7409 }
7410 };
7411
7412 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
7413 {
7414 // unlike the other rollback methods, the rmdir rollback is only
7415 // needed to record the subtree changes in the journal for inode
7416 // replicas that are auth for empty dirfrags. no actual changes to
7417 // the file system are taking place here, so there is no Mutation.
7418
7419 rmdir_rollback rollback;
7420 auto p = rbl.cbegin();
7421 decode(rollback, p);
7422
7423 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7424 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
7425 ceph_assert(mdr || mds->is_resolve());
7426
7427 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7428 if (!dir)
7429 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7430 ceph_assert(dir);
7431 CDentry *dn = dir->lookup(rollback.src_dname);
7432 ceph_assert(dn);
7433 dout(10) << " dn " << *dn << dendl;
7434 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7435 ceph_assert(straydir);
7436 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7437 ceph_assert(straydn);
7438 dout(10) << " straydn " << *straydn << dendl;
7439 CInode *in = straydn->get_linkage()->get_inode();
7440
7441 dn->push_projected_linkage(in);
7442 straydn->push_projected_linkage();
7443
7444 if (rollback.snapbl.length() && in->snaprealm) {
7445 bool hadrealm;
7446 auto p = rollback.snapbl.cbegin();
7447 decode(hadrealm, p);
7448 if (hadrealm) {
7449 decode(in->snaprealm->srnode, p);
7450 } else {
7451 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7452 }
7453 }
7454
7455 if (mdr && !mdr->more()->slave_update_journaled) {
7456 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7457
7458 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7459 return;
7460 }
7461
7462
7463 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
7464 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
7465 mdlog->start_entry(le);
7466
7467 le->commit.add_dir_context(dn->get_dir());
7468 le->commit.add_primary_dentry(dn, in, true);
7469 // slave: no need to journal straydn
7470
7471 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7472 le->commit.renamed_dirino = in->ino();
7473
7474 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7475
7476 submit_mdlog_entry(le,
7477 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7478 dn, straydn),
7479 mdr, __func__);
7480 mdlog->flush();
7481 }
7482
7483 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7484 {
7485 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7486
7487 straydn->get_dir()->unlink_inode(straydn);
7488 dn->pop_projected_linkage();
7489 straydn->pop_projected_linkage();
7490
7491 CInode *in = dn->get_linkage()->get_inode();
7492 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7493 !mdr || mdr->more()->slave_update_journaled);
7494
7495 if (mds->is_resolve()) {
7496 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7497 mdcache->try_trim_non_auth_subtree(root);
7498 }
7499
7500 if (mdr)
7501 mdcache->request_finish(mdr);
7502
7503 mdcache->finish_rollback(reqid, mdr);
7504 }
7505
7506
7507 /** _dir_is_nonempty[_unlocked]
7508 *
7509 * check if a directory is non-empty (i.e. whether rmdir must be refused).
7510 *
7511 * the unlocked variant is a fastpath check; we can't really be
7512 * sure until we rdlock the filelock.
7513 */
7514 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7515 {
7516 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7517 ceph_assert(in->is_auth());
7518
7519 if (in->filelock.is_cached())
7520 return false; // there can be pending async create/unlink. don't know.
7521 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7522 return true; // in a snapshot!
7523
7524 auto&& ls = in->get_dirfrags();
7525 for (const auto& dir : ls) {
7526 // is the frag obviously non-empty?
7527 if (dir->is_auth()) {
7528 if (dir->get_projected_fnode()->fragstat.size()) {
7529 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7530 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7531 return true;
7532 }
7533 }
7534 }
7535
7536 return false;
7537 }
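// Why the is_cached() bail-out above: a cached filelock means clients
// may hold unflushed async creates/unlinks, so fragstat can be stale in
// either direction and the only honest answer is "don't know" (treat as
// empty here; the locked check below is authoritative).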
7538
7539 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7540 {
7541 dout(10) << "dir_is_nonempty " << *in << dendl;
7542 ceph_assert(in->is_auth());
7543 ceph_assert(in->filelock.can_read(mdr->get_client()));
7544
7545 frag_info_t dirstat;
7546 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7547
7548 auto&& ls = in->get_dirfrags();
7549 for (const auto& dir : ls) {
7550 const fnode_t *pf = dir->get_projected_fnode();
7551 if (pf->fragstat.size()) {
7552 dout(10) << "dir_is_nonempty dirstat has "
7553 << pf->fragstat.size() << " items " << *dir << dendl;
7554 return true;
7555 }
7556
7557 if (pf->accounted_fragstat.version == dirstat_version)
7558 dirstat.add(pf->accounted_fragstat);
7559 else
7560 dirstat.add(pf->fragstat);
7561 }
7562
7563 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7564 }
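// The locked check is an accounting check, not a listing: each frag
// contributes its accounted_fragstat when that matches the inode's
// dirstat version (otherwise its live fragstat), and any disagreement
// between the sum and dirstat.size() means an update is still in flight,
// so the directory is conservatively reported non-empty. Roughly
// (sketch; frag_stat_for() is a hypothetical helper):
//
//   frag_info_t sum;
//   for (auto dir : dirs) sum.add(frag_stat_for(dir));
//   bool maybe_nonempty = (sum.size() != dirstat.size());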
7565
7566
7567 // ======================================================
7568
7569
7570 class C_MDS_rename_finish : public ServerLogContext {
7571 CDentry *srcdn;
7572 CDentry *destdn;
7573 CDentry *straydn;
7574 public:
7575 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7576 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7577 ServerLogContext(s, r),
7578 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7579 void finish(int r) override {
7580 ceph_assert(r == 0);
7581 server->_rename_finish(mdr, srcdn, destdn, straydn);
7582 }
7583 };
7584
7585
7586 /** handle_client_rename
7587 *
7588 * rename master is the destdn auth. this is because cached inodes
7589 * must remain connected. thus, any replica of srci must also
7590 * replicate destdn, and possibly straydn, so that srci (and
7591 * destdn->inode) remain connected during the rename.
7592 *
7593 * to do this, we freeze srci, then master (destdn auth) verifies that
7594 * all other nodes have also replicated destdn and straydn. note that
7595 * destdn replicas need not also replicate srci. this only works when
7596 * destdn is master.
7597 *
7598 * This function takes responsibility for the passed mdr.
7599 */
7600 void Server::handle_client_rename(MDRequestRef& mdr)
7601 {
7602 const cref_t<MClientRequest> &req = mdr->client_request;
7603 dout(7) << "handle_client_rename " << *req << dendl;
7604
7605 filepath destpath = req->get_filepath();
7606 filepath srcpath = req->get_filepath2();
7607 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
7608 respond_to_request(mdr, -EBUSY);
7609 return;
7610 }
7611
7612 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
7613 if (!destdn)
7614 return;
7615
7616 dout(10) << " destdn " << *destdn << dendl;
7617 CDir *destdir = destdn->get_dir();
7618 ceph_assert(destdir->is_auth());
7619 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7620
7621 dout(10) << " srcdn " << *srcdn << dendl;
7622 CDir *srcdir = srcdn->get_dir();
7623 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7624 CInode *srci = srcdnl->get_inode();
7625 dout(10) << " srci " << *srci << dendl;
7626
7627 // -- some sanity checks --
7628 if (destdn == srcdn) {
7629 dout(7) << "rename src=dest, noop" << dendl;
7630 respond_to_request(mdr, 0);
7631 return;
7632 }
7633
7634 // dest a child of src?
7635 // e.g. mv /usr /usr/foo
7636 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
7637 dout(7) << "cannot rename item to be a child of itself" << dendl;
7638 respond_to_request(mdr, -EINVAL);
7639 return;
7640 }
7641
7642 // is this a stray migration, reintegration or merge? (sanity checks!)
7643 if (mdr->reqid.name.is_mds() &&
7644 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
7645 MDS_INO_IS_STRAY(destpath.get_ino())) &&
7646 !(destdnl->is_remote() &&
7647 destdnl->get_remote_ino() == srci->ino())) {
7648 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
7649 return;
7650 }
7651
7652 CInode *oldin = 0;
7653 if (!destdnl->is_null()) {
7654 //dout(10) << "dest dn exists " << *destdn << dendl;
7655 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
7656 if (!oldin) return;
7657 dout(10) << " oldin " << *oldin << dendl;
7658
7659 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
7660 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
7661 respond_to_request(mdr, -ENOTEMPTY);
7662 return;
7663 }
7664
7665 // mv /some/thing /to/some/existing_other_thing
7666 if (oldin->is_dir() && !srci->is_dir()) {
7667 respond_to_request(mdr, -EISDIR);
7668 return;
7669 }
7670 if (!oldin->is_dir() && srci->is_dir()) {
7671 respond_to_request(mdr, -ENOTDIR);
7672 return;
7673 }
7674 if (srci == oldin && !srcdir->inode->is_stray()) {
7675 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
7676 return;
7677 }
7678 }
7679
7680 vector<CDentry*>& srctrace = mdr->dn[1];
7681 vector<CDentry*>& desttrace = mdr->dn[0];
7682
7683 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7684 if (destpath.get_ino() != srcpath.get_ino() &&
7685 !(req->get_source().is_mds() &&
7686 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7687 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
7688 CInode *destbase = desttrace[0]->get_dir()->get_inode();
7689 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7690 while (srcbase != destbase &&
7691 !srcbase->is_projected_ancestor_of(destbase)) {
7692 CDentry *pdn = srcbase->get_projected_parent_dn();
7693 srctrace.insert(srctrace.begin(), pdn);
7694 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
7695 srcbase = pdn->get_dir()->get_inode();
7696 }
7697
7698 // then, extend destpath until it shares the same parent inode as srcpath.
7699 while (destbase != srcbase) {
7700 CDentry *pdn = destbase->get_projected_parent_dn();
7701 desttrace.insert(desttrace.begin(), pdn);
7702 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
7703 destbase = pdn->get_dir()->get_inode();
7704 }
7705 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
7706 }
7707
7708
7709 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7710 if (linkmerge)
7711 dout(10) << " this is a link merge" << dendl;
7712
7713 // -- create stray dentry? --
7714 CDentry *straydn = NULL;
7715 if (destdnl->is_primary() && !linkmerge) {
7716 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
7717 if (!straydn)
7718 return;
7719 dout(10) << " straydn is " << *straydn << dendl;
7720 } else if (mdr->straydn) {
7721 mdr->unpin(mdr->straydn);
7722 mdr->straydn = NULL;
7723 }
7724
7725
7726 // -- locks --
7727 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7728 MutationImpl::LockOpVec lov;
7729
7730 // we need to update srci's ctime. xlock its least contended lock to do that...
7731 lov.add_xlock(&srci->linklock);
7732 lov.add_xlock(&srci->snaplock);
7733
7734 if (oldin) {
7735 // xlock oldin (for nlink--)
7736 lov.add_xlock(&oldin->linklock);
7737 lov.add_xlock(&oldin->snaplock);
7738 if (oldin->is_dir()) {
7739 ceph_assert(srci->is_dir());
7740 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7741
7742 // adjust locking order?
7743 int cmp = mdr->compare_paths();
7744 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
7745 std::reverse(lov.begin(), lov.end());
7746 } else {
7747 ceph_assert(!srci->is_dir());
7748 // adjust locking order.
7749 if (srci->ino() > oldin->ino())
7750 std::reverse(lov.begin(), lov.end());
7751 }
7752 }
7753
7754 // straydn?
7755 if (straydn) {
7756 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7757 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7758 lov.add_xlock(&straydn->lock);
7759 }
7760
7761 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
7762 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
7763 return;
7764
7765 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7766 }
7767
7768 if (linkmerge)
7769 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
7770
7771 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7772 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7773 return;
7774
7775 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
7776 return;
7777
7778 if (!check_fragment_space(mdr, destdn->get_dir()))
7779 return;
7780
7781 if (!check_access(mdr, srci, MAY_WRITE))
7782 return;
7783 }
7784
7785 // with read lock, really verify oldin is empty
7786 if (oldin &&
7787 oldin->is_dir() &&
7788 _dir_is_nonempty(mdr, oldin)) {
7789 respond_to_request(mdr, -ENOTEMPTY);
7790 return;
7791 }
7792
7793 /* project_snaprealm_past_parent() will do this job
7794 *
7795 // moving between snaprealms?
7796 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7797 SnapRealm *srcrealm = srci->find_snaprealm();
7798 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7799 if (srcrealm != destrealm &&
7800 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7801 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7802 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7803 mdcache->snaprealm_create(mdr, srci);
7804 return;
7805 }
7806 }
7807 */
7808
7809 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7810
7811 // -- open all srcdn inode frags, if any --
7812 // we need these open so that auth can properly delegate from inode to dirfrags
7813 // after the inode is _ours_.
7814 if (srcdnl->is_primary() &&
7815 !srcdn->is_auth() &&
7816 srci->is_dir()) {
7817 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
7818 mdr->set_stickydirs(srci);
7819
7820 frag_vec_t leaves;
7821 srci->dirfragtree.get_leaves(leaves);
7822 for (const auto& leaf : leaves) {
7823 CDir *dir = srci->get_dirfrag(leaf);
7824 if (!dir) {
7825 dout(10) << " opening " << leaf << " under " << *srci << dendl;
7826 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7827 return;
7828 }
7829 }
7830 }
7831
7832 // -- prepare snaprealm ---
7833
7834 if (linkmerge) {
7835 if (!mdr->more()->srci_srnode &&
7836 srci->get_projected_inode()->nlink == 1 &&
7837 srci->is_projected_snaprealm_global()) {
7838 sr_t *new_srnode = srci->prepare_new_srnode(0);
7839 srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);
7840
7841 srci->clear_snaprealm_global(new_srnode);
7842 mdr->more()->srci_srnode = new_srnode;
7843 }
7844 } else {
7845 if (oldin && !mdr->more()->desti_srnode) {
7846 if (oldin->is_projected_snaprealm_global()) {
7847 sr_t *new_srnode = oldin->prepare_new_srnode(0);
7848 oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
7849 // dropping the last linkage or dropping the last remote linkage,
7850 // detach the inode from the global snaprealm
7851 auto nlink = oldin->get_projected_inode()->nlink;
7852 if (nlink == 1 ||
7853 (nlink == 2 && !destdnl->is_primary() &&
7854 !oldin->get_projected_parent_dir()->inode->is_stray()))
7855 oldin->clear_snaprealm_global(new_srnode);
7856 mdr->more()->desti_srnode = new_srnode;
7857 } else if (destdnl->is_primary()) {
7858 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7859 snapid_t follows = dest_realm->get_newest_seq();
7860 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
7861 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
7862 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7863 mdr->more()->desti_srnode = new_srnode;
7864 }
7865 }
7866 }
7867 if (!mdr->more()->srci_srnode) {
7868 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7869 if (srci->is_projected_snaprealm_global()) {
7870 sr_t *new_srnode = srci->prepare_new_srnode(0);
7871 srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
7872 mdr->more()->srci_srnode = new_srnode;
7873 } else if (srcdnl->is_primary()) {
7874 SnapRealm *src_realm = srcdir->inode->find_snaprealm();
7875 snapid_t follows = src_realm->get_newest_seq();
7876 if (src_realm != dest_realm &&
7877 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
7878 sr_t *new_srnode = srci->prepare_new_srnode(follows);
7879 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
7880 mdr->more()->srci_srnode = new_srnode;
7881 }
7882 }
7883 }
7884 }
7885
7886 // -- prepare witnesses --
7887
7888 /*
7889 * NOTE: we use _all_ replicas as witnesses.
7890 * this probably isn't totally necessary (esp for file renames),
7891 * but if/when we change that, we have to make sure rejoin is
7892 * sufficiently robust to handle strong rejoins from survivors
7893 * with totally wrong dentry->inode linkage.
7894 * (currently, it can ignore rename effects, because the resolve
7895 * stage will sort them out.)
7896 */
7897 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
7898 if (srcdn->is_auth())
7899 srcdn->list_replicas(witnesses);
7900 else
7901 witnesses.insert(srcdn->authority().first);
7902 if (srcdnl->is_remote() && !srci->is_auth())
7903 witnesses.insert(srci->authority().first);
7904 destdn->list_replicas(witnesses);
7905 if (destdnl->is_remote() && !oldin->is_auth())
7906 witnesses.insert(oldin->authority().first);
7907 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7908
7909 if (!witnesses.empty()) {
7910 // Replicas can't see projected dentry linkages and will get confused.
7911 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
7912 // can't project these inodes' linkages.
7913 bool need_flush = false;
7914 for (auto& dn : srctrace) {
7915 if (dn->is_projected()) {
7916 need_flush = true;
7917 break;
7918 }
7919 }
7920 if (!need_flush) {
7921 CDentry *dn = destdn;
7922 do {
7923 if (dn->is_projected()) {
7924 need_flush = true;
7925 break;
7926 }
7927 CInode *diri = dn->get_dir()->get_inode();
7928 dn = diri->get_projected_parent_dn();
7929 } while (dn);
7930 }
7931 if (need_flush) {
7932 mdlog->wait_for_safe(
7933 new MDSInternalContextWrapper(mds,
7934 new C_MDS_RetryRequest(mdcache, mdr)));
7935 mdlog->flush();
7936 return;
7937 }
7938 }
7939
7940 // do srcdn auth last
7941 mds_rank_t last = MDS_RANK_NONE;
7942 if (!srcdn->is_auth()) {
7943 last = srcdn->authority().first;
7944 mdr->more()->srcdn_auth_mds = last;
7945 // ask auth of srci to mark srci as ambiguous auth if more than two MDSs
7946 // are involved in the rename operation.
7947 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
7948 dout(10) << " preparing ambiguous auth for srci" << dendl;
7949 ceph_assert(mdr->more()->is_remote_frozen_authpin);
7950 ceph_assert(mdr->more()->rename_inode == srci);
7951 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7952 return;
7953 }
7954 }
7955
7956 for (set<mds_rank_t>::iterator p = witnesses.begin();
7957 p != witnesses.end();
7958 ++p) {
7959 if (*p == last) continue; // do it last!
7960 if (mdr->more()->witnessed.count(*p)) {
7961 dout(10) << " already witnessed by mds." << *p << dendl;
7962 } else if (mdr->more()->waiting_on_slave.count(*p)) {
7963 dout(10) << " already waiting on witness mds." << *p << dendl;
7964 } else {
7965 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
7966 return;
7967 }
7968 }
7969 if (!mdr->more()->waiting_on_slave.empty())
7970 return; // we're waiting for a witness.
7971
7972 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
7973 dout(10) << " preparing last witness (srcdn auth)" << dendl;
7974 ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
7975 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
7976 return;
7977 }
7978
7979 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
7980 if (!mdr->more()->slaves.empty() && !srci->is_dir())
7981 ceph_assert(g_conf()->mds_kill_rename_at != 3);
7982 if (!mdr->more()->slaves.empty() && srci->is_dir())
7983 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7984
7985 // -- declare now --
7986 mdr->set_mds_stamp(ceph_clock_now());
7987
7988 // -- prepare journal entry --
7989 mdr->ls = mdlog->get_current_segment();
7990 EUpdate *le = new EUpdate(mdlog, "rename");
7991 mdlog->start_entry(le);
7992 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7993 if (!mdr->more()->witnessed.empty()) {
7994 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7995
7996 le->reqid = mdr->reqid;
7997 le->had_slaves = true;
7998
7999 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8000 // no need to send frozen auth pin to the recovering auth MDS of srci
8001 mdr->more()->is_remote_frozen_authpin = false;
8002 }
8003
8004 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
8005 if (le->client_map.length())
8006 le->cmapv = mds->sessionmap.get_projected();
8007
8008 // -- commit locally --
8009 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8010
8011 journal_and_reply(mdr, srci, destdn, le, fin);
8012 mds->balancer->maybe_fragment(destdn->get_dir(), false);
8013 }
8014
8015
8016 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8017 {
8018 dout(10) << "_rename_finish " << *mdr << dendl;
8019
8020 if (!mdr->more()->witnessed.empty())
8021 mdcache->logged_master_update(mdr->reqid);
8022
8023 // apply
8024 _rename_apply(mdr, srcdn, destdn, straydn);
8025
8026 mdcache->send_dentry_link(destdn, mdr);
8027
8028 CDentry::linkage_t *destdnl = destdn->get_linkage();
8029 CInode *in = destdnl->get_inode();
8030 bool need_eval = mdr->more()->cap_imports.count(in);
8031
8032 // test hack: test slave commit
8033 if (!mdr->more()->slaves.empty() && !in->is_dir())
8034 ceph_assert(g_conf()->mds_kill_rename_at != 5);
8035 if (!mdr->more()->slaves.empty() && in->is_dir())
8036 ceph_assert(g_conf()->mds_kill_rename_at != 6);
8037
8038 // bump popularity
8039 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8040 if (destdnl->is_remote() && in->is_auth())
8041 mds->balancer->hit_inode(in, META_POP_IWR);
8042
8043 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8044
8045 ceph_assert(g_conf()->mds_kill_rename_at != 7);
8046
8047 // reply
8048 respond_to_request(mdr, 0);
8049
8050 if (need_eval)
8051 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8052
8053 // clean up?
8054 // respond_to_request() drops locks. So stray reintegration can race with us.
8055 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8056 mdcache->notify_stray(straydn);
8057 }
8058 }
8059
8060
8061
8062 // helpers
8063
8064 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
8065 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8066 {
8067 if (mds->is_cluster_degraded() &&
8068 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8069 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8070 if (mdr->more()->waiting_on_slave.empty())
8071 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8072 return false;
8073 }
8074
8075 dout(10) << "_rename_prepare_witness mds." << who << dendl;
8076 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
8077
8078 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8079 for (auto dn : srctrace)
8080 req->srcdnpath.push_dentry(dn->get_name());
8081 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8082 for (auto dn : dsttrace)
8083 req->destdnpath.push_dentry(dn->get_name());
8084 if (straydn)
8085 mdcache->encode_replica_stray(straydn, who, req->straybl);
8086
8087 if (mdr->more()->srci_srnode)
8088 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8089 if (mdr->more()->desti_srnode)
8090 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8091
8092 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
8093
8094 // srcdn auth will verify our current witness list is sufficient
8095 req->witnesses = witnesses;
8096
8097 req->op_stamp = mdr->get_op_stamp();
8098 mds->send_message_mds(req, who);
8099
8100 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
8101 mdr->more()->waiting_on_slave.insert(who);
8102 return true;
8103 }
8104
8105 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8106 {
8107 version_t oldpv = mdr->more()->inode_import_v;
8108
8109 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8110
8111 /* import node */
8112 auto blp = mdr->more()->inode_import.cbegin();
8113
8114 // imported caps
8115 map<client_t,entity_inst_t> client_map;
8116 map<client_t, client_metadata_t> client_metadata_map;
8117 decode(client_map, blp);
8118 decode(client_metadata_map, blp);
8119 prepare_force_open_sessions(client_map, client_metadata_map,
8120 mdr->more()->imported_session_map);
8121 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
8122 encode(client_metadata_map, *client_map_bl);
8123
8124 list<ScatterLock*> updated_scatterlocks;
8125 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8126 mdr->more()->cap_imports, updated_scatterlocks);
8127
8128 // hack: force back to !auth and clean, temporarily
8129 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8130 srcdnl->get_inode()->mark_clean();
8131
8132 return oldpv;
8133 }
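// About the "force back to !auth and clean" hack above: the imported
// inode must not look auth or dirty while the rename is still only
// projected; the proper state is restored once the journal entry is
// safe (or the import is unwound on rollback), assuming the usual
// migrator import sequencing.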
8134
8135 bool Server::_need_force_journal(CInode *diri, bool empty)
8136 {
8137 auto&& dirs = diri->get_dirfrags();
8138
8139 bool force_journal = false;
8140 if (empty) {
8141 for (const auto& dir : dirs) {
8142 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8143 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
8144 force_journal = true;
8145 break;
8146 } else
8147 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
8148 }
8149 } else {
8150 // see if any children of our frags are auth subtrees.
8151 std::vector<CDir*> subtrees;
8152 mdcache->get_subtrees(subtrees);
8153 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8154 for (const auto& dir : dirs) {
8155 for (const auto& subtree : subtrees) {
8156 if (dir->contains(subtree)) {
8157 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8158 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8159 << *subtree << dendl;
8160 force_journal = true;
8161 break;
8162 } else
8163 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
8164 } else
8165 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
8166 }
8167 if (force_journal)
8168 break;
8169 }
8170 }
8171 return force_journal;
8172 }
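// Example of why force-journaling matters here: renaming a directory we
// are not auth for can still move an auth subtree bound of ours (a
// subtree root at or below one of its frags); the rename must then be
// journaled locally so replay can re-root that subtree, which is what
// the containment scan above detects.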
8173
8174 void Server::_rename_prepare(MDRequestRef& mdr,
8175 EMetaBlob *metablob, bufferlist *client_map_bl,
8176 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8177 {
8178 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8179 if (straydn)
8180 dout(10) << " straydn " << *straydn << dendl;
8181
8182 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8183 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8184 CInode *srci = srcdnl->get_inode();
8185 CInode *oldin = destdnl->get_inode();
8186
8187 // primary+remote link merge?
8188 bool linkmerge = (srci == oldin);
8189 if (linkmerge)
8190 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8191 bool silent = srcdn->get_dir()->inode->is_stray();
8192
8193 bool force_journal_dest = false;
8194 if (srci->is_dir() && !destdn->is_auth()) {
8195 if (srci->is_auth()) {
8196 // if we are auth for srci and exporting it, force journal because journal replay needs
8197 // the source inode to create auth subtrees.
8198 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8199 force_journal_dest = true;
8200 } else
8201 force_journal_dest = _need_force_journal(srci, false);
8202 }
8203
8204 bool force_journal_stray = false;
8205 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8206 force_journal_stray = _need_force_journal(oldin, true);
8207
8208 if (linkmerge)
8209 dout(10) << " merging remote and primary links to the same inode" << dendl;
8210 if (silent)
8211 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8212 if (force_journal_dest)
8213 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8214 if (force_journal_stray)
8215 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8216
8217 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8218 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8219 metablob->renamed_dirino = srci->ino();
8220 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8221 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8222 metablob->renamed_dirino = oldin->ino();
8223 }
8224
8225 // prepare
8226 CInode::mempool_inode *spi = 0; // renamed inode
8227 CInode::mempool_inode *tpi = 0; // target/overwritten inode
8228
8229 // target inode
8230 if (!linkmerge) {
8231 if (destdnl->is_primary()) {
8232 ceph_assert(straydn); // moving to straydn.
8233 // link--, and move.
8234 if (destdn->is_auth()) {
8235 auto &pi= oldin->project_inode(); //project_snaprealm
8236 pi.inode.version = straydn->pre_dirty(pi.inode.version);
8237 pi.inode.update_backtrace();
8238 tpi = &pi.inode;
8239 }
8240 straydn->push_projected_linkage(oldin);
8241 } else if (destdnl->is_remote()) {
8242 // nlink-- targeti
8243 if (oldin->is_auth()) {
8244 auto &pi = oldin->project_inode();
8245 pi.inode.version = oldin->pre_dirty();
8246 tpi = &pi.inode;
8247 }
8248 }
8249 }
8250
8251 // dest
8252 if (srcdnl->is_remote()) {
8253 if (!linkmerge) {
8254 // destdn
8255 if (destdn->is_auth())
8256 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8257 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8258 // srci
8259 if (srci->is_auth()) {
8260 auto &pi = srci->project_inode();
8261 pi.inode.version = srci->pre_dirty();
8262 spi = &pi.inode;
8263 }
8264 } else {
8265 dout(10) << " will merge remote onto primary link" << dendl;
8266 if (destdn->is_auth()) {
8267 auto &pi = oldin->project_inode();
8268 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
8269 spi = &pi.inode;
8270 }
8271 }
8272 } else { // primary
8273 if (destdn->is_auth()) {
8274 version_t oldpv;
8275 if (srcdn->is_auth())
8276 oldpv = srci->get_projected_version();
8277 else {
8278 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8279
8280 // note which dirfrags have child subtrees in the journal
8281 // event, so that we can open those (as bounds) during replay.
8282 if (srci->is_dir()) {
8283 auto&& ls = srci->get_dirfrags();
8284 for (const auto& dir : ls) {
8285 if (!dir->is_auth())
8286 metablob->renamed_dir_frags.push_back(dir->get_frag());
8287 }
8288 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8289 }
8290 }
8291 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary && srcdnl->snaprealm
8293 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8294 pi.inode.update_backtrace();
8295 spi = &pi.inode;
8296 }
8297 destdn->push_projected_linkage(srci);
8298 }
8299
8300 // src
8301 if (srcdn->is_auth())
8302 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8303 srcdn->push_projected_linkage(); // push null linkage
8304
8305 if (!silent) {
8306 if (spi) {
8307 spi->ctime = mdr->get_op_stamp();
8308 if (mdr->get_op_stamp() > spi->rstat.rctime)
8309 spi->rstat.rctime = mdr->get_op_stamp();
8310 spi->change_attr++;
8311 if (linkmerge)
8312 spi->nlink--;
8313 }
8314 if (tpi) {
8315 tpi->ctime = mdr->get_op_stamp();
8316 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8317 tpi->rstat.rctime = mdr->get_op_stamp();
8318 tpi->change_attr++;
8319 {
8320 std::string t;
8321 destdn->make_path_string(t, true);
8322 tpi->stray_prior_path = std::move(t);
8323 }
8324 tpi->nlink--;
8325 if (tpi->nlink == 0)
8326 oldin->state_set(CInode::STATE_ORPHAN);
8327 }
8328 }
8329
8330 // prepare nesting, mtime updates
8331 int predirty_dir = silent ? 0 : PREDIRTY_DIR;
8332
8333 // guarantee the stray dir is processed first during journal replay: unlink the old inode,
8334 // then link the source inode to destdn
8335 if (destdnl->is_primary()) {
8336 ceph_assert(straydn);
8337 if (straydn->is_auth()) {
8338 metablob->add_dir_context(straydn->get_dir());
8339 metablob->add_dir(straydn->get_dir(), true);
8340 }
8341 }
8342
8343 // sub off target
8344 if (destdn->is_auth() && !destdnl->is_null()) {
8345 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8346 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8347 if (destdnl->is_primary()) {
8348 ceph_assert(straydn);
8349 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8350 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8351 }
8352 }
8353
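// Sign convention for the predirty_journal_parents() calls above and below:
// the final argument is the dentry link delta on the given dirfrag (-1 when
// a dentry is unlinked, +1 when one is linked), and PREDIRTY_PRIMARY also
// propagates rstat/fragstat changes up the primary parent chain.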
8354 // move srcdn
8355 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8356 int flags = predirty_dir | predirty_primary;
8357 if (srcdn->is_auth())
8358 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8359 if (destdn->is_auth())
8360 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8361
8362 // add it all to the metablob
8363 // target inode
8364 if (!linkmerge) {
8365 if (destdnl->is_primary()) {
8366 ceph_assert(straydn);
8367 if (destdn->is_auth()) {
8368 // project snaprealm, too
8369 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8370 oldin->project_snaprealm(desti_srnode);
8371 if (tpi->nlink == 0)
8372 ceph_assert(!desti_srnode->is_parent_global());
8373 desti_srnode = NULL;
8374 }
8375 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8376 metablob->add_primary_dentry(straydn, oldin, true, true);
8377 } else if (force_journal_stray) {
8378 dout(10) << " forced journaling straydn " << *straydn << dendl;
8379 metablob->add_dir_context(straydn->get_dir());
8380 metablob->add_primary_dentry(straydn, oldin, true);
8381 }
8382 } else if (destdnl->is_remote()) {
8383 if (oldin->is_auth()) {
8384 sr_t *new_srnode = NULL;
8385 if (mdr->slave_request) {
8386 if (mdr->slave_request->desti_snapbl.length() > 0) {
8387 new_srnode = new sr_t();
8388 auto p = mdr->slave_request->desti_snapbl.cbegin();
8389 decode(*new_srnode, p);
8390 }
8391 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8392 new_srnode = desti_srnode;
8393 desti_srnode = NULL;
8394 }
8395 if (new_srnode) {
8396 oldin->project_snaprealm(new_srnode);
8397 if (tpi->nlink == 0)
8398 ceph_assert(!new_srnode->is_parent_global());
8399 }
8400 // auth for targeti
8401 metablob->add_dir_context(oldin->get_projected_parent_dir());
8402 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
8403 CEPH_NOSNAP, 0, destdnl);
8404 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
8405 }
8406 }
8407 }
8408
8409 // dest
8410 if (srcdnl->is_remote()) {
8411 ceph_assert(!linkmerge);
8412 if (destdn->is_auth() && !destdnl->is_null())
8413 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8414 else
8415 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8416
8417 if (destdn->is_auth())
8418 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8419
8420 if (srci->is_auth()) { // it's remote
8421 if (mdr->slave_request) {
8422 if (mdr->slave_request->srci_snapbl.length() > 0) {
8423 sr_t *new_srnode = new sr_t();
8424 auto p = mdr->slave_request->srci_snapbl.cbegin();
8425 decode(*new_srnode, p);
8426 srci->project_snaprealm(new_srnode);
8427 }
8428 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8429 srci->project_snaprealm(srci_srnode);
8430 srci_srnode = NULL;
8431 }
8432
8433 CDentry *srci_pdn = srci->get_projected_parent_dn();
8434 metablob->add_dir_context(srci_pdn->get_dir());
8435 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
8436 metablob->add_primary_dentry(srci_pdn, srci, true);
8437 }
8438 } else if (srcdnl->is_primary()) {
8439 // project snap parent update?
8440 if (destdn->is_auth()) {
8441 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8442 srci->project_snaprealm(srci_srnode);
8443 srci_srnode = NULL;
8444 }
8445 }
8446
8447 if (destdn->is_auth() && !destdnl->is_null())
8448 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8449
8450 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8451
8452 if (destdn->is_auth())
8453 metablob->add_primary_dentry(destdn, srci, true, true);
8454 else if (force_journal_dest) {
8455 dout(10) << " forced journaling destdn " << *destdn << dendl;
8456 metablob->add_dir_context(destdn->get_dir());
8457 metablob->add_primary_dentry(destdn, srci, true);
8458 if (srcdn->is_auth() && srci->is_dir()) {
8459 // journal the new subtrees' root dirfrags
8460 auto&& ls = srci->get_dirfrags();
8461 for (const auto& dir : ls) {
8462 if (dir->is_auth())
8463 metablob->add_dir(dir, true);
8464 }
8465 }
8466 }
8467 }
8468
8469 // src
8470 if (srcdn->is_auth()) {
8471 dout(10) << " journaling srcdn " << *srcdn << dendl;
8472 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8473 // also journal the inode in case we need to do slave rename rollback. It is OK to add
8474 // both primary and NULL dentries, because during journal replay the null dentry is
8475 // processed after the primary dentry.
8476 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8477 metablob->add_primary_dentry(srcdn, srci, true);
8478 metablob->add_null_dentry(srcdn, true);
8479 } else
8480 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8481
8482 // make renamed inode first track the dn
8483 if (srcdnl->is_primary() && destdn->is_auth()) {
8484 ceph_assert(srci->first <= destdn->first);
8485 srci->first = destdn->first;
8486 }
8487 // make stray inode first track the straydn
8488 if (straydn && straydn->is_auth()) {
8489 ceph_assert(oldin->first <= straydn->first);
8490 oldin->first = straydn->first;
8491 }
8492
8493 if (oldin && oldin->is_dir()) {
8494 ceph_assert(straydn);
8495 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8496 }
8497 if (srci->is_dir())
8498 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8499
8500 }
8501
8502
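// _rename_apply() is the "apply" half: it pops the projections made in
// _rename_prepare() in a deliberate order -- unlink the dest (moving any
// overwritten inode to the stray dir), unlink the src, then relink the
// source inode at the dest.  Snaprealms are popped *before* the linkage
// changes so that a newly created realm can split inodes_with_caps away
// from the old realm while the old parent linkage is still in place.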
8503 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8504 {
8505 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8506 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8507
8508 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8509 CDentry::linkage_t *destdnl = destdn->get_linkage();
8510
8511 CInode *oldin = destdnl->get_inode();
8512
8513 // primary+remote link merge?
8514 bool linkmerge = (srcdnl->get_inode() == oldin);
8515 if (linkmerge)
8516 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8517
8518 bool new_in_snaprealm = false;
8519 bool new_oldin_snaprealm = false;
8520
8521 // target inode
8522 if (!linkmerge) {
8523 if (destdnl->is_primary()) {
8524 ceph_assert(straydn);
8525 dout(10) << "straydn is " << *straydn << dendl;
8526
8527 // if there is newly created snaprealm, need to split old snaprealm's
8528 // inodes_with_caps. So pop snaprealm before linkage changes.
8529 if (destdn->is_auth()) {
8530 bool hadrealm = (oldin->snaprealm ? true : false);
8531 oldin->early_pop_projected_snaprealm();
8532 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8533 } else {
8534 ceph_assert(mdr->slave_request);
8535 if (mdr->slave_request->desti_snapbl.length()) {
8536 new_oldin_snaprealm = !oldin->snaprealm;
8537 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8538 ceph_assert(oldin->snaprealm);
8539 ceph_assert(oldin->snaprealm->have_past_parents_open());
8540 }
8541 }
8542
8543 destdn->get_dir()->unlink_inode(destdn, false);
8544
8545 straydn->pop_projected_linkage();
8546 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8547 ceph_assert(!straydn->is_projected()); // no other projected
8548
8549 // nlink-- targeti
8550 if (destdn->is_auth())
8551 oldin->pop_and_dirty_projected_inode(mdr->ls);
8552
8553 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
8554 } else if (destdnl->is_remote()) {
8555 destdn->get_dir()->unlink_inode(destdn, false);
8556 if (oldin->is_auth()) {
8557 oldin->pop_and_dirty_projected_inode(mdr->ls);
8558 } else if (mdr->slave_request) {
8559 if (mdr->slave_request->desti_snapbl.length() > 0) {
8560 ceph_assert(oldin->snaprealm);
8561 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8562 }
8563 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8564 delete desti_srnode;
8565 desti_srnode = NULL;
8566 }
8567 }
8568 }
8569
8570 // unlink src before we relink it at dest
8571 CInode *in = srcdnl->get_inode();
8572 ceph_assert(in);
8573
8574 bool srcdn_was_remote = srcdnl->is_remote();
8575 if (!srcdn_was_remote) {
8576 // if there is newly created snaprealm, need to split old snaprealm's
8577 // inodes_with_caps. So pop snaprealm before linkage changes.
8578 if (destdn->is_auth()) {
8579 bool hadrealm = (in->snaprealm ? true : false);
8580 in->early_pop_projected_snaprealm();
8581 new_in_snaprealm = (in->snaprealm && !hadrealm);
8582 } else {
8583 ceph_assert(mdr->slave_request);
8584 if (mdr->slave_request->srci_snapbl.length()) {
8585 new_in_snaprealm = !in->snaprealm;
8586 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8587 ceph_assert(in->snaprealm);
8588 ceph_assert(in->snaprealm->have_past_parents_open());
8589 }
8590 }
8591 }
8592
8593 srcdn->get_dir()->unlink_inode(srcdn);
8594
8595 // dest
8596 if (srcdn_was_remote) {
8597 if (!linkmerge) {
8598 // destdn
8599 destdnl = destdn->pop_projected_linkage();
8600 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8601 ceph_assert(!destdn->is_projected()); // no other projected
8602
8603 destdn->link_remote(destdnl, in);
8604 if (destdn->is_auth())
8605 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8606 // in
8607 if (in->is_auth()) {
8608 in->pop_and_dirty_projected_inode(mdr->ls);
8609 } else if (mdr->slave_request) {
8610 if (mdr->slave_request->srci_snapbl.length() > 0) {
8611 ceph_assert(in->snaprealm);
8612 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8613 }
8614 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8615 delete srci_srnode;
8616 srci_srnode = NULL;
8617 }
8618 } else {
8619 dout(10) << "merging remote onto primary link" << dendl;
8620 oldin->pop_and_dirty_projected_inode(mdr->ls);
8621 }
8622 } else { // primary
8623 if (linkmerge) {
8624 dout(10) << "merging primary onto remote link" << dendl;
8625 destdn->get_dir()->unlink_inode(destdn, false);
8626 }
8627 destdnl = destdn->pop_projected_linkage();
8628 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8629 ceph_assert(!destdn->is_projected()); // no other projected
8630
8631 // srcdn inode import?
8632 if (!srcdn->is_auth() && destdn->is_auth()) {
8633 ceph_assert(mdr->more()->inode_import.length() > 0);
8634
8635 map<client_t,Capability::Import> imported_caps;
8636
8637 // finish cap imports
8638 finish_force_open_sessions(mdr->more()->imported_session_map);
8639 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
8640 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
8641 mdr->more()->srcdn_auth_mds, true,
8642 mdr->more()->imported_session_map,
8643 mdr->more()->cap_imports[destdnl->get_inode()],
8644 imported_caps);
8645 }
8646
8647 mdr->more()->inode_import.clear();
8648 encode(imported_caps, mdr->more()->inode_import);
8649
8650 /* hack: add an auth pin for each xlock we hold. These were
8651 * remote xlocks previously but now they're local and
8652 * we're going to try to unpin them when we xlock_finish. */
8653
8654 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
8655 i != mdr->locks.end();
8656 ++i) {
8657 SimpleLock *lock = i->lock;
8658 if (lock->get_parent() != destdnl->get_inode())
8659 break;
8660 if (i->is_xlock() && !lock->is_locallock())
8661 mds->locker->xlock_import(lock);
8662 }
8663
8664 // hack: fix auth bit
8665 in->state_set(CInode::STATE_AUTH);
8666
8667 mdr->clear_ambiguous_auth();
8668 }
8669
8670 if (destdn->is_auth())
8671 in->pop_and_dirty_projected_inode(mdr->ls);
8672 }
8673
8674 // src
8675 if (srcdn->is_auth())
8676 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
8677 srcdn->pop_projected_linkage();
8678 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8679 ceph_assert(!srcdn->is_projected()); // no other projected
8680
8681 // apply remaining projected inodes (nested)
8682 mdr->apply();
8683
8684 // update subtree map?
8685 if (destdnl->is_primary() && in->is_dir())
8686 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
8687
8688 if (straydn && oldin->is_dir())
8689 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
8690
8691 if (new_oldin_snaprealm)
8692 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
8693 if (new_in_snaprealm)
8694 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
8695
8696 // removing a new dn?
8697 if (srcdn->is_auth())
8698 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
8699 }
8700
8701
8702
8703 // ------------
8704 // SLAVE
8705
8706 class C_MDS_SlaveRenamePrep : public ServerLogContext {
8707 CDentry *srcdn, *destdn, *straydn;
8708 public:
8709 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8710 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
8711 void finish(int r) override {
8712 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
8713 }
8714 };
8715
8716 class C_MDS_SlaveRenameCommit : public ServerContext {
8717 MDRequestRef mdr;
8718 CDentry *srcdn, *destdn, *straydn;
8719 public:
8720 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8721 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
8722 void finish(int r) override {
8723 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
8724 }
8725 };
8726
8727 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
8728 MDRequestRef mdr;
8729 public:
8730 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
8731 ServerContext(s), mdr(r) {}
8732 void finish(int r) override {
8733 server->_slave_rename_sessions_flushed(mdr);
8734 }
8735 };
8736
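// Slave-side rename: the master drives a two-phase commit across every MDS
// that replicates srcdn.  Roughly:
//
//   master -> slave       OP_RENAMEPREP     handled below; the slave journals
//                                           an ESlaveUpdate::OP_PREPARE plus
//                                           a rollback blob
//   slave  -> bystanders  OP_RENAMENOTIFY   lock/cap barrier, acked before
//                                           the slave replies
//   slave  -> master      OP_RENAMEPREPACK  ack, or an expanded witness list
//
// The master then tells the slave to commit or abort: commit journals
// ESlaveUpdate::OP_COMMIT (_commit_slave_rename), abort decodes the rollback
// blob and runs do_rename_rollback().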
8737 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
8738 {
8739 dout(10) << "handle_slave_rename_prep " << *mdr
8740 << " " << mdr->slave_request->srcdnpath
8741 << " to " << mdr->slave_request->destdnpath
8742 << dendl;
8743
8744 if (mdr->slave_request->is_interrupted()) {
8745 dout(10) << " slave request interrupted, sending noop reply" << dendl;
8746 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8747 reply->mark_interrupted();
8748 mds->send_message_mds(reply, mdr->slave_to_mds);
8749 mdr->reset_slave_request();
8750 return;
8751 }
8752
8753 // discover destdn
8754 filepath destpath(mdr->slave_request->destdnpath);
8755 dout(10) << " dest " << destpath << dendl;
8756 vector<CDentry*> trace;
8757 CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
8758 int r = mdcache->path_traverse(mdr, cf, destpath,
8759 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
8760 &trace);
8761 if (r > 0) return;
8762 if (r == -ESTALE) {
8763 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8764 mdr->slave_to_mds, true);
8765 return;
8766 }
8767 ceph_assert(r == 0); // we shouldn't get an error here!
8768
8769 CDentry *destdn = trace.back();
8770 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8771 dout(10) << " destdn " << *destdn << dendl;
8772 mdr->pin(destdn);
8773
8774 // discover srcdn
8775 filepath srcpath(mdr->slave_request->srcdnpath);
8776 dout(10) << " src " << srcpath << dendl;
8777 CInode *srci = nullptr;
8778 r = mdcache->path_traverse(mdr, cf, srcpath,
8779 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
8780 &trace, &srci);
8781 if (r > 0) return;
8782 ceph_assert(r == 0);
8783
8784 CDentry *srcdn = trace.back();
8785 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8786 dout(10) << " srcdn " << *srcdn << dendl;
8787 mdr->pin(srcdn);
8788 mdr->pin(srci);
8789
8790 // stray?
8791 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8792 if (linkmerge)
8793 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8794 CDentry *straydn = mdr->straydn;
8795 if (destdnl->is_primary() && !linkmerge)
8796 ceph_assert(straydn);
8797
8798 mdr->set_op_stamp(mdr->slave_request->op_stamp);
8799 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
8800
8801 // set up commit waiter (early, to clean up any freezing etc we do)
8802 if (!mdr->more()->slave_commit)
8803 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
8804
8805 // am i srcdn auth?
8806 if (srcdn->is_auth()) {
8807 set<mds_rank_t> srcdnrep;
8808 srcdn->list_replicas(srcdnrep);
8809
8810 bool reply_witness = false;
8811 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
8812 // freeze?
8813 // we need this to
8814 // - avoid conflicting lock state changes
8815 // - avoid concurrent updates to the inode
8816 // (this could also be accomplished with the versionlock)
8817 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8818 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
8819 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
8820
8821 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8822 if (srcdnl->get_inode()->is_frozen_auth_pin())
8823 mdr->unfreeze_auth_pin();
8824
8825 if (!frozen_inode) {
8826 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
8827 return;
8828 }
8829
8830 /*
8831 * set ambiguous auth for srci
8832 * NOTE: we don't worry about ambiguous cache expire as we do
8833 * with subtree migrations because all slaves will pin
8834 * srcdn->get_inode() for duration of this rename.
8835 */
8836 mdr->set_ambiguous_auth(srcdnl->get_inode());
8837
8838 // just mark the source inode as ambiguous auth if more than two MDSes are involved;
8839 // the master will send another OP_RENAMEPREP slave request later.
8840 if (mdr->slave_request->witnesses.size() > 1) {
8841 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
8842 reply_witness = true;
8843 }
8844
8845 // make sure bystanders have received all lock related messages
8846 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8847 if (*p == mdr->slave_to_mds ||
8848 (mds->is_cluster_degraded() &&
8849 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
8850 continue;
8851 auto notify = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
8852 mds->send_message_mds(notify, *p);
8853 mdr->more()->waiting_on_slave.insert(*p);
8854 }
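// The OP_RENAMENOTIFY round above is a barrier: every bystander replica of
// srcdn must ack (see handle_slave_rename_notify_ack) before this slave
// replies to the master, ensuring no bystander is still digesting older
// lock messages when auth for srci becomes ambiguous.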
8855
8856 // make sure clients have received all cap related messages
8857 set<client_t> export_client_set;
8858 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
8859
8860 MDSGatherBuilder gather(g_ceph_context);
8861 flush_client_sessions(export_client_set, gather);
8862 if (gather.has_subs()) {
8863 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
8864 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
8865 gather.activate();
8866 }
8867 }
8868
8869 // is witness list sufficient?
8870 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8871 if (*p == mdr->slave_to_mds ||
8872 mdr->slave_request->witnesses.count(*p)) continue;
8873 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
8874 reply_witness = true;
8875 break;
8876 }
8877
8878 if (reply_witness) {
8879 ceph_assert(!srcdnrep.empty());
8880 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8881 reply->witnesses.swap(srcdnrep);
8882 mds->send_message_mds(reply, mdr->slave_to_mds);
8883 mdr->reset_slave_request();
8884 return;
8885 }
8886 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
8887 if (!mdr->more()->waiting_on_slave.empty()) {
8888 dout(10) << " still waiting for rename notify acks from "
8889 << mdr->more()->waiting_on_slave << dendl;
8890 return;
8891 }
8892 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
8893 // set ambiguous auth for srci on witnesses
8894 mdr->set_ambiguous_auth(srcdnl->get_inode());
8895 }
8896
8897 // encode everything we'd need to roll this back... basically, just the original state.
8898 rename_rollback rollback;
8899
8900 rollback.reqid = mdr->reqid;
8901
8902 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
8903 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8904 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
8905 rollback.orig_src.dname = srcdn->get_name();
8906 if (srcdnl->is_primary())
8907 rollback.orig_src.ino = srcdnl->get_inode()->ino();
8908 else {
8909 ceph_assert(srcdnl->is_remote());
8910 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
8911 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
8912 }
8913
8914 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
8915 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8916 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
8917 rollback.orig_dest.dname = destdn->get_name();
8918 if (destdnl->is_primary())
8919 rollback.orig_dest.ino = destdnl->get_inode()->ino();
8920 else if (destdnl->is_remote()) {
8921 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
8922 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
8923 }
8924
8925 if (straydn) {
8926 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
8927 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
8928 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
8929 rollback.stray.dname = straydn->get_name();
8930 }
8931 if (mdr->slave_request->desti_snapbl.length()) {
8932 CInode *oldin = destdnl->get_inode();
8933 if (oldin->snaprealm) {
8934 encode(true, rollback.desti_snapbl);
8935 oldin->encode_snap_blob(rollback.desti_snapbl);
8936 } else {
8937 encode(false, rollback.desti_snapbl);
8938 }
8939 }
8940 if (mdr->slave_request->srci_snapbl.length()) {
8941 if (srci->snaprealm) {
8942 encode(true, rollback.srci_snapbl);
8943 srci->encode_snap_blob(rollback.srci_snapbl);
8944 } else {
8945 encode(false, rollback.srci_snapbl);
8946 }
8947 }
8948 encode(rollback, mdr->more()->rollback_bl);
8949 // FIXME: rollback snaprealm
8950 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
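// The rollback blob is deliberately self-contained: any surviving MDS can
// decode it and restore the original dentries and stats even if both this
// rank and the master restart.  The consuming side (do_rename_rollback,
// below) is simply:
//
//   rename_rollback rollback;
//   auto p = rbl.cbegin();
//   decode(rollback, p);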
8951
8952 // journal.
8953 mdr->ls = mdlog->get_current_segment();
8954 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
8955 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
8956 mdlog->start_entry(le);
8957 le->rollback = mdr->more()->rollback_bl;
8958
8959 bufferlist blah; // inode import data... obviously not used if we're the slave
8960 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
8961
8962 if (le->commit.empty()) {
8963 dout(10) << " empty metablob, skipping journal" << dendl;
8964 mdlog->cancel_entry(le);
8965 mdr->ls = NULL;
8966 _logged_slave_rename(mdr, srcdn, destdn, straydn);
8967 } else {
8968 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
8969 mdr->more()->slave_update_journaled = true;
8970 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
8971 mdr, __func__);
8972 mdlog->flush();
8973 }
8974 }
8975
8976 void Server::_logged_slave_rename(MDRequestRef& mdr,
8977 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8978 {
8979 dout(10) << "_logged_slave_rename " << *mdr << dendl;
8980
8981 // prepare ack
8982 ref_t<MMDSSlaveRequest> reply;
8983 if (!mdr->aborted) {
8984 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8985 if (!mdr->more()->slave_update_journaled)
8986 reply->mark_not_journaled();
8987 }
8988
8989 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8990 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
8991
8992 // export srci?
8993 if (srcdn->is_auth() && srcdnl->is_primary()) {
8994 // set export bounds for CInode::encode_export()
8995 if (reply) {
8996 std::vector<CDir*> bounds;
8997 if (srcdnl->get_inode()->is_dir()) {
8998 srcdnl->get_inode()->get_dirfrags(bounds);
8999 for (const auto& bound : bounds) {
9000 bound->state_set(CDir::STATE_EXPORTBOUND);
9001 }
9002 }
9003
9004 map<client_t,entity_inst_t> exported_client_map;
9005 map<client_t, client_metadata_t> exported_client_metadata_map;
9006 bufferlist inodebl;
9007 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9008 exported_client_map,
9009 exported_client_metadata_map);
9010
9011 for (const auto& bound : bounds) {
9012 bound->state_clear(CDir::STATE_EXPORTBOUND);
9013 }
9014
9015 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9016 encode(exported_client_metadata_map, reply->inode_export);
9017 reply->inode_export.claim_append(inodebl);
9018 reply->inode_export_v = srcdnl->get_inode()->inode.version;
9019 }
9020
9021 // remove mdr auth pin
9022 mdr->auth_unpin(srcdnl->get_inode());
9023 mdr->more()->is_inode_exporter = true;
9024
9025 if (srcdnl->get_inode()->is_dirty())
9026 srcdnl->get_inode()->mark_clean();
9027
9028 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9029 }
9030
9031 // apply
9032 _rename_apply(mdr, srcdn, destdn, straydn);
9033
9034 CDentry::linkage_t *destdnl = destdn->get_linkage();
9035
9036 // bump popularity
9037 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9038 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
9039 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
9040
9041 // done.
9042 mdr->reset_slave_request();
9043 mdr->straydn = 0;
9044
9045 if (reply) {
9046 mds->send_message_mds(reply, mdr->slave_to_mds);
9047 } else {
9048 ceph_assert(mdr->aborted);
9049 dout(10) << " abort flag set, finishing" << dendl;
9050 mdcache->request_finish(mdr);
9051 }
9052 }
9053
9054 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
9055 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9056 {
9057 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
9058
9059 CInode *in = destdn->get_linkage()->get_inode();
9060
9061 inodeno_t migrated_stray;
9062 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9063 migrated_stray = in->ino();
9064
9065 MDSContext::vec finished;
9066 if (r == 0) {
9067 // unfreeze+singleauth inode
9068 // hmm, do i really need to delay this?
9069 if (mdr->more()->is_inode_exporter) {
9070 // drop our pins
9071 // we exported, clear out any xlocks that we moved to another MDS
9072
9073 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9074 i != mdr->locks.end(); ) {
9075 SimpleLock *lock = i->lock;
9076 if (lock->get_parent() != in)
9077 break;
9078 // we only care about xlocks on the exported inode
9079 if (i->is_xlock() && !lock->is_locallock())
9080 mds->locker->xlock_export(i++, mdr.get());
9081 else
9082 ++i;
9083 }
9084
9085 map<client_t,Capability::Import> peer_imported;
9086 auto bp = mdr->more()->inode_import.cbegin();
9087 decode(peer_imported, bp);
9088
9089 dout(10) << " finishing inode export on " << *in << dendl;
9090 mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
9091 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9092
9093 // unfreeze
9094 ceph_assert(in->is_frozen_inode());
9095 in->unfreeze_inode(finished);
9096 }
9097
9098 // singleauth
9099 if (mdr->more()->is_ambiguous_auth) {
9100 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9101 mdr->more()->is_ambiguous_auth = false;
9102 }
9103
9104 if (straydn && mdr->more()->slave_update_journaled) {
9105 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9106 if (strayin && !strayin->snaprealm)
9107 mdcache->clear_dirty_bits_for_stray(strayin);
9108 }
9109
9110 mds->queue_waiters(finished);
9111 mdr->cleanup();
9112
9113 if (mdr->more()->slave_update_journaled) {
9114 // write a commit to the journal
9115 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
9116 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
9117 ESlaveUpdate::RENAME);
9118 mdlog->start_entry(le);
9119 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
9120 mdlog->flush();
9121 } else {
9122 _committed_slave(mdr);
9123 }
9124 } else {
9125
9126 // abort
9127 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9128 // witness list to the master, and it failed before we tried prep again.
9129 if (mdr->more()->rollback_bl.length()) {
9130 if (mdr->more()->is_inode_exporter) {
9131 dout(10) << " reversing inode export of " << *in << dendl;
9132 in->abort_export();
9133 }
9134 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
9135 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
9136 // rollback but preserve the slave request
9137 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
9138 mdr->more()->rollback_bl.clear();
9139 } else
9140 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
9141 } else {
9142 dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
9143 // singleauth
9144 if (mdr->more()->is_ambiguous_auth) {
9145 if (srcdn->is_auth())
9146 mdr->more()->rename_inode->unfreeze_inode(finished);
9147
9148 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9149 mdr->more()->is_ambiguous_auth = false;
9150 }
9151 mds->queue_waiters(finished);
9152 mdcache->request_finish(mdr);
9153 }
9154 }
9155
9156 if (migrated_stray && mds->is_stopping())
9157 mdcache->shutdown_export_stray_finish(migrated_stray);
9158 }
9159
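// _rollback_repair_dir() undoes the fragstat/rstat side effects of the
// rename on a single dirfrag.  For example, rolling back a rename that
// unlinked a plain file from this dir means isdir=false, linkunlink=+1:
// nfiles goes back up by one, the file's rstats are re-added, and the old
// mtime/rctime are restored only if they still equal the rename's ctime
// (i.e. nothing else has touched the dir since).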
9160 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
9161 bool isdir, int linkunlink, nest_info_t &rstat)
9162 {
9163 fnode_t *pf;
9164 pf = dir->project_fnode();
9165 mut->add_projected_fnode(dir);
9166 pf->version = dir->pre_dirty();
9167
9168 if (isdir) {
9169 pf->fragstat.nsubdirs += linkunlink;
9170 } else {
9171 pf->fragstat.nfiles += linkunlink;
9172 }
9173 if (r.ino) {
9174 pf->rstat.rbytes += linkunlink * rstat.rbytes;
9175 pf->rstat.rfiles += linkunlink * rstat.rfiles;
9176 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
9177 pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
9178 }
9179 if (pf->fragstat.mtime == ctime) {
9180 pf->fragstat.mtime = r.dirfrag_old_mtime;
9181 if (pf->rstat.rctime == ctime)
9182 pf->rstat.rctime = r.dirfrag_old_rctime;
9183 }
9184 mut->add_updated_lock(&dir->get_inode()->filelock);
9185 mut->add_updated_lock(&dir->get_inode()->nestlock);
9186 }
9187
9188 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9189 MutationRef mut;
9190 CDentry *srcdn;
9191 version_t srcdnpv;
9192 CDentry *destdn;
9193 CDentry *straydn;
9194 map<client_t,ref_t<MClientSnap>> splits[2];
9195 bool finish_mdr;
9196 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
9197 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9198 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
9199 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
9200 straydn(st), finish_mdr(f) {
9201 splits[0].swap(_splits[0]);
9202 splits[1].swap(_splits[1]);
9203 }
9204 void finish(int r) override {
9205 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
9206 destdn, straydn, splits, finish_mdr);
9207 }
9208 };
9209
9210 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
9211 bool finish_mdr)
9212 {
9213 rename_rollback rollback;
9214 auto p = rbl.cbegin();
9215 decode(rollback, p);
9216
9217 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9218 // need to finish this update before sending resolve to claim the subtree
9219 mdcache->add_rollback(rollback.reqid, master);
9220
9221 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9222 mut->ls = mds->mdlog->get_current_segment();
9223
9224 CDentry *srcdn = NULL;
9225 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9226 if (!srcdir)
9227 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9228 if (srcdir) {
9229 dout(10) << " srcdir " << *srcdir << dendl;
9230 srcdn = srcdir->lookup(rollback.orig_src.dname);
9231 if (srcdn) {
9232 dout(10) << " srcdn " << *srcdn << dendl;
9233 ceph_assert(srcdn->get_linkage()->is_null());
9234 } else
9235 dout(10) << " srcdn not found" << dendl;
9236 } else
9237 dout(10) << " srcdir not found" << dendl;
9238
9239 CDentry *destdn = NULL;
9240 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9241 if (!destdir)
9242 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9243 if (destdir) {
9244 dout(10) << " destdir " << *destdir << dendl;
9245 destdn = destdir->lookup(rollback.orig_dest.dname);
9246 if (destdn)
9247 dout(10) << " destdn " << *destdn << dendl;
9248 else
9249 dout(10) << " destdn not found" << dendl;
9250 } else
9251 dout(10) << " destdir not found" << dendl;
9252
9253 CInode *in = NULL;
9254 if (rollback.orig_src.ino) {
9255 in = mdcache->get_inode(rollback.orig_src.ino);
9256 if (in && in->is_dir())
9257 ceph_assert(srcdn && destdn);
9258 } else
9259 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9260
9261 CDir *straydir = NULL;
9262 CDentry *straydn = NULL;
9263 if (rollback.stray.dirfrag.ino) {
9264 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9265 if (straydir) {
9266 dout(10) << "straydir " << *straydir << dendl;
9267 straydn = straydir->lookup(rollback.stray.dname);
9268 if (straydn) {
9269 dout(10) << " straydn " << *straydn << dendl;
9270 ceph_assert(straydn->get_linkage()->is_primary());
9271 } else
9272 dout(10) << " straydn not found" << dendl;
9273 } else
9274 dout(10) << "straydir not found" << dendl;
9275 }
9276
9277 CInode *target = NULL;
9278 if (rollback.orig_dest.ino) {
9279 target = mdcache->get_inode(rollback.orig_dest.ino);
9280 if (target)
9281 ceph_assert(destdn && straydn);
9282 } else if (rollback.orig_dest.remote_ino)
9283 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9284
9285 // can't use is_auth() in the resolve stage
9286 mds_rank_t whoami = mds->get_nodeid();
9287 // slave
9288 ceph_assert(!destdn || destdn->authority().first != whoami);
9289 ceph_assert(!straydn || straydn->authority().first != whoami);
9290
9291 bool force_journal_src = false;
9292 bool force_journal_dest = false;
9293 if (in && in->is_dir() && srcdn->authority().first != whoami)
9294 force_journal_src = _need_force_journal(in, false);
9295 if (in && target && target->is_dir())
9296 force_journal_dest = _need_force_journal(in, true);
9297
9298 version_t srcdnpv = 0;
9299 // repair src
9300 if (srcdn) {
9301 if (srcdn->authority().first == whoami)
9302 srcdnpv = srcdn->pre_dirty();
9303 if (rollback.orig_src.ino) {
9304 ceph_assert(in);
9305 srcdn->push_projected_linkage(in);
9306 } else
9307 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9308 rollback.orig_src.remote_d_type);
9309 }
9310
9311 map<client_t,ref_t<MClientSnap>> splits[2];
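// splits[0] collects MClientSnap messages for clients affected by merging
// srci's snaprealm back into its parent; splits[1] does the same for the
// rename target.  Both are sent at the end of _rename_rollback_finish()
// unless we are in the resolve stage (no client sessions to notify yet).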
9312
9313 CInode::mempool_inode *pip = nullptr;
9314 if (in) {
9315 bool projected;
9316 if (in->get_projected_parent_dn()->authority().first == whoami) {
9317 auto &pi = in->project_inode();
9318 pip = &pi.inode;
9319 mut->add_projected_inode(in);
9320 pip->version = in->pre_dirty();
9321 projected = true;
9322 } else {
9323 pip = in->get_projected_inode();
9324 projected = false;
9325 }
9326 if (pip->ctime == rollback.ctime)
9327 pip->ctime = rollback.orig_src.old_ctime;
9328
9329 if (rollback.srci_snapbl.length() && in->snaprealm) {
9330 bool hadrealm;
9331 auto p = rollback.srci_snapbl.cbegin();
9332 decode(hadrealm, p);
9333 if (hadrealm) {
9334 if (projected && !mds->is_resolve()) {
9335 sr_t *new_srnode = new sr_t();
9336 decode(*new_srnode, p);
9337 in->project_snaprealm(new_srnode);
9338 } else
9339 decode(in->snaprealm->srnode, p);
9340 } else {
9341 SnapRealm *realm;
9342 if (rollback.orig_src.ino) {
9343 ceph_assert(srcdir);
9344 realm = srcdir->get_inode()->find_snaprealm();
9345 } else {
9346 realm = in->snaprealm->parent;
9347 }
9348 if (!mds->is_resolve())
9349 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9350 if (projected)
9351 in->project_snaprealm(NULL);
9352 else
9353 in->snaprealm->merge_to(realm);
9354 }
9355 }
9356 }
9357
9358 if (srcdn && srcdn->authority().first == whoami) {
9359 nest_info_t blah;
9360 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9361 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
9362 }
9363
9364 // repair dest
9365 if (destdn) {
9366 if (rollback.orig_dest.ino && target) {
9367 destdn->push_projected_linkage(target);
9368 } else if (rollback.orig_dest.remote_ino) {
9369 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9370 rollback.orig_dest.remote_d_type);
9371 } else {
9372 // the dentry will be trimmed soon; it's OK to have the wrong linkage
9373 if (rollback.orig_dest.ino)
9374 ceph_assert(mds->is_resolve());
9375 destdn->push_projected_linkage();
9376 }
9377 }
9378
9379 if (straydn)
9380 straydn->push_projected_linkage();
9381
9382 if (target) {
9383 bool projected;
9384 CInode::mempool_inode *ti = nullptr;
9385 if (target->get_projected_parent_dn()->authority().first == whoami) {
9386 auto &pi = target->project_inode();
9387 ti = &pi.inode;
9388 mut->add_projected_inode(target);
9389 ti->version = target->pre_dirty();
9390 projected = true;
9391 } else {
9392 ti = target->get_projected_inode();
9393 projected = false;
9394 }
9395 if (ti->ctime == rollback.ctime)
9396 ti->ctime = rollback.orig_dest.old_ctime;
9397 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9398 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9399 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9400 else
9401 ceph_assert(rollback.orig_dest.remote_ino &&
9402 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9403 } else
9404 ti->nlink++;
9405
9406 if (rollback.desti_snapbl.length() && target->snaprealm) {
9407 bool hadrealm;
9408 auto p = rollback.desti_snapbl.cbegin();
9409 decode(hadrealm, p);
9410 if (hadrealm) {
9411 if (projected && !mds->is_resolve()) {
9412 sr_t *new_srnode = new sr_t();
9413 decode(*new_srnode, p);
9414 target->project_snaprealm(new_srnode);
9415 } else
9416 decode(target->snaprealm->srnode, p);
9417 } else {
9418 SnapRealm *realm;
9419 if (rollback.orig_dest.ino) {
9420 ceph_assert(destdir);
9421 realm = destdir->get_inode()->find_snaprealm();
9422 } else {
9423 realm = target->snaprealm->parent;
9424 }
9425 if (!mds->is_resolve())
9426 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9427 if (projected)
9428 target->project_snaprealm(NULL);
9429 else
9430 target->snaprealm->merge_to(realm);
9431 }
9432 }
9433 }
9434
9435 if (srcdn)
9436 dout(0) << " srcdn back to " << *srcdn << dendl;
9437 if (in)
9438 dout(0) << " srci back to " << *in << dendl;
9439 if (destdn)
9440 dout(0) << " destdn back to " << *destdn << dendl;
9441 if (target)
9442 dout(0) << " desti back to " << *target << dendl;
9443
9444 // journal it
9445 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
9446 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
9447 mdlog->start_entry(le);
9448
9449 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9450 le->commit.add_dir_context(srcdir);
9451 if (rollback.orig_src.ino)
9452 le->commit.add_primary_dentry(srcdn, 0, true);
9453 else
9454 le->commit.add_remote_dentry(srcdn, true);
9455 }
9456
9457 if (!rollback.orig_src.ino && // remote linkage
9458 in && in->authority().first == whoami) {
9459 le->commit.add_dir_context(in->get_projected_parent_dir());
9460 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9461 }
9462
9463 if (force_journal_dest) {
9464 ceph_assert(rollback.orig_dest.ino);
9465 le->commit.add_dir_context(destdir);
9466 le->commit.add_primary_dentry(destdn, 0, true);
9467 }
9468
9469 // slave: no need to journal straydn
9470
9471 if (target && target != in && target->authority().first == whoami) {
9472 ceph_assert(rollback.orig_dest.remote_ino);
9473 le->commit.add_dir_context(target->get_projected_parent_dir());
9474 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9475 }
9476
9477 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9478 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9479 le->commit.renamed_dirino = in->ino();
9480 if (srcdn->authority().first == whoami) {
9481 auto&& ls = in->get_dirfrags();
9482 for (const auto& dir : ls) {
9483 if (!dir->is_auth())
9484 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9485 }
9486 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9487 }
9488 } else if (force_journal_dest) {
9489 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9490 le->commit.renamed_dirino = target->ino();
9491 }
9492
9493 if (target && target->is_dir()) {
9494 ceph_assert(destdn);
9495 mdcache->project_subtree_rename(target, straydir, destdir);
9496 }
9497
9498 if (in && in->is_dir()) {
9499 ceph_assert(srcdn);
9500 mdcache->project_subtree_rename(in, destdir, srcdir);
9501 }
9502
9503 if (mdr && !mdr->more()->slave_update_journaled) {
9504 ceph_assert(le->commit.empty());
9505 mdlog->cancel_entry(le);
9506 mut->ls = NULL;
9507 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
9508 } else {
9509 ceph_assert(!le->commit.empty());
9510 if (mdr)
9511 mdr->more()->slave_update_journaled = false;
9512 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9513 srcdn, srcdnpv, destdn, straydn,
9514 splits, finish_mdr);
9515 submit_mdlog_entry(le, fin, mdr, __func__);
9516 mdlog->flush();
9517 }
9518 }
9519
9520 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
9521 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9522 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
9523 {
9524 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9525
9526 if (straydn) {
9527 straydn->get_dir()->unlink_inode(straydn);
9528 straydn->pop_projected_linkage();
9529 }
9530 if (destdn) {
9531 destdn->get_dir()->unlink_inode(destdn);
9532 destdn->pop_projected_linkage();
9533 }
9534 if (srcdn) {
9535 srcdn->pop_projected_linkage();
9536 if (srcdn->authority().first == mds->get_nodeid()) {
9537 srcdn->mark_dirty(srcdnpv, mut->ls);
9538 if (srcdn->get_linkage()->is_primary())
9539 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9540 }
9541 }
9542
9543 mut->apply();
9544
9545 if (srcdn && srcdn->get_linkage()->is_primary()) {
9546 CInode *in = srcdn->get_linkage()->get_inode();
9547 if (in && in->is_dir()) {
9548 ceph_assert(destdn);
9549 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9550 }
9551 }
9552
9553 if (destdn) {
9554 CInode *oldin = destdn->get_linkage()->get_inode();
9555 // update subtree map?
9556 if (oldin && oldin->is_dir()) {
9557 ceph_assert(straydn);
9558 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9559 }
9560 }
9561
9562 if (mds->is_resolve()) {
9563 CDir *root = NULL;
9564 if (straydn)
9565 root = mdcache->get_subtree_root(straydn->get_dir());
9566 else if (destdn)
9567 root = mdcache->get_subtree_root(destdn->get_dir());
9568 if (root)
9569 mdcache->try_trim_non_auth_subtree(root);
9570 } else {
9571 mdcache->send_snaps(splits[1]);
9572 mdcache->send_snaps(splits[0]);
9573 }
9574
9575 if (mdr) {
9576 MDSContext::vec finished;
9577 if (mdr->more()->is_ambiguous_auth) {
9578 if (srcdn->is_auth())
9579 mdr->more()->rename_inode->unfreeze_inode(finished);
9580
9581 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9582 mdr->more()->is_ambiguous_auth = false;
9583 }
9584 mds->queue_waiters(finished);
9585 if (finish_mdr || mdr->aborted)
9586 mdcache->request_finish(mdr);
9587 else
9588 mdr->more()->slave_rolling_back = false;
9589 }
9590
9591 mdcache->finish_rollback(mut->reqid, mdr);
9592
9593 mut->cleanup();
9594 }
9595
9596 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
9597 {
9598 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9599 << " witnessed by " << ack->get_source()
9600 << " " << *ack << dendl;
9601 mds_rank_t from = mds_rank_t(ack->get_source().num());
9602
9603 // note slave
9604 mdr->more()->slaves.insert(from);
9605 if (mdr->more()->srcdn_auth_mds == from &&
9606 mdr->more()->is_remote_frozen_authpin &&
9607 !mdr->more()->is_ambiguous_auth) {
9608 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
9609 }
9610
9611 // witnessed? or add extra witnesses?
9612 ceph_assert(mdr->more()->witnessed.count(from) == 0);
9613 if (ack->is_interrupted()) {
9614 dout(10) << " slave request interrupted, noop" << dendl;
9615 } else if (ack->witnesses.empty()) {
9616 mdr->more()->witnessed.insert(from);
9617 if (!ack->is_not_journaled())
9618 mdr->more()->has_journaled_slaves = true;
9619 } else {
9620 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
9621 mdr->more()->extra_witnesses = ack->witnesses;
9622 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
9623 }
9624
9625 // srci import?
9626 if (ack->inode_export.length()) {
9627 dout(10) << " got srci import" << dendl;
9628 mdr->more()->inode_import.share(ack->inode_export);
9629 mdr->more()->inode_import_v = ack->inode_export_v;
9630 }
9631
9632 // remove from waiting list
9633 ceph_assert(mdr->more()->waiting_on_slave.count(from));
9634 mdr->more()->waiting_on_slave.erase(from);
9635
9636 if (mdr->more()->waiting_on_slave.empty())
9637 dispatch_client_request(mdr); // go again!
9638 else
9639 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
9640 }
9641
9642 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
9643 {
9644 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
9645 << ack->get_source() << dendl;
9646 ceph_assert(mdr->is_slave());
9647 mds_rank_t from = mds_rank_t(ack->get_source().num());
9648
9649 if (mdr->more()->waiting_on_slave.count(from)) {
9650 mdr->more()->waiting_on_slave.erase(from);
9651
9652 if (mdr->more()->waiting_on_slave.empty()) {
9653 if (mdr->slave_request)
9654 dispatch_slave_request(mdr);
9655 } else
9656 dout(10) << " still waiting for rename notify acks from "
9657 << mdr->more()->waiting_on_slave << dendl;
9658 }
9659 }
9660
9661 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
9662 {
9663 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
9664
9665 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
9666 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
9667
9668 if (mdr->more()->waiting_on_slave.empty()) {
9669 if (mdr->slave_request)
9670 dispatch_slave_request(mdr);
9671 } else
9672 dout(10) << " still waiting for rename notify acks from "
9673 << mdr->more()->waiting_on_slave << dendl;
9674 }
9675 }
9676
9677 // snaps
9678 /* This function takes responsibility for the passed mdr*/
9679 void Server::handle_client_lssnap(MDRequestRef& mdr)
9680 {
9681 const cref_t<MClientRequest> &req = mdr->client_request;
9682
9683 // traverse to path
9684 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
9685 if (!diri)
9686 return;
9687
9688 if (!diri->is_dir()) {
9689 respond_to_request(mdr, -ENOTDIR);
9690 return;
9691 }
9692 dout(10) << "lssnap on " << *diri << dendl;
9693
9694 // lock snap
9695 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
9696 return;
9697
9698 if (!check_access(mdr, diri, MAY_READ))
9699 return;
9700
9701 SnapRealm *realm = diri->find_snaprealm();
9702 map<snapid_t,const SnapInfo*> infomap;
9703 realm->get_snap_info(infomap, diri->get_oldest_snap());
9704
9705 unsigned max_entries = req->head.args.readdir.max_entries;
9706 if (!max_entries)
9707 max_entries = infomap.size();
9708 int max_bytes = req->head.args.readdir.max_bytes;
9709 if (!max_bytes)
9710 // make sure at least one item can be encoded
9711 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
9712
9713 __u64 last_snapid = 0;
9714 string offset_str = req->get_path2();
9715 if (!offset_str.empty())
9716 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
9717
9718 // Empty DirStat
9719 bufferlist dirbl;
9720 static DirStat empty;
9721 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
9722
9723 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
9724
9725 __u32 num = 0;
9726 bufferlist dnbl;
9727 auto p = infomap.upper_bound(last_snapid);
9728 for (; p != infomap.end() && num < max_entries; ++p) {
9729 dout(10) << p->first << " -> " << *p->second << dendl;
9730
9731 // actual
9732 string snap_name;
9733 if (p->second->ino == diri->ino())
9734 snap_name = p->second->name;
9735 else
9736 snap_name = p->second->get_long_name();
9737
9738 unsigned start_len = dnbl.length();
9739 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
9740 break;
9741
9742 encode(snap_name, dnbl);
9743 // infinite lease
9744 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
9745 mds->locker->encode_lease(dnbl, mdr->session->info, e);
9746 dout(20) << "encode_infinite_lease" << dendl;
9747
9748 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
9749 if (r < 0) {
9750 bufferlist keep;
9751 keep.substr_of(dnbl, 0, start_len);
9752 dnbl.swap(keep);
9753 break;
9754 }
9755 ++num;
9756 }
9757
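// Final reply payload, matching the encodes below (readdir-style framing):
//
//   DirStat | __u32 num | __u16 flags | num x (name, LeaseStat, InodeStat)
//
// CEPH_READDIR_FRAG_END marks the final fragment; CEPH_READDIR_FRAG_COMPLETE
// additionally means the listing started from the beginning, so the client
// has now seen every snapshot.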
9758 encode(num, dirbl);
9759 __u16 flags = 0;
9760 if (p == infomap.end()) {
9761 flags = CEPH_READDIR_FRAG_END;
9762 if (last_snapid == 0)
9763 flags |= CEPH_READDIR_FRAG_COMPLETE;
9764 }
9765 encode(flags, dirbl);
9766 dirbl.claim_append(dnbl);
9767
9768 mdr->reply_extra_bl = dirbl;
9769 mdr->tracei = diri;
9770 respond_to_request(mdr, 0);
9771 }
9772
9773
9774 // MKSNAP
9775
9776 struct C_MDS_mksnap_finish : public ServerLogContext {
9777 CInode *diri;
9778 SnapInfo info;
9779 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
9780 ServerLogContext(s, r), diri(di), info(i) {}
9781 void finish(int r) override {
9782 server->_mksnap_finish(mdr, diri, info);
9783 }
9784 };
9785
9786 /* This function takes responsibility for the passed mdr*/
9787 void Server::handle_client_mksnap(MDRequestRef& mdr)
9788 {
9789 const cref_t<MClientRequest> &req = mdr->client_request;
9790 // make sure we have as new a map as the client
9791 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
9792 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
9793 return;
9794 }
9795 if (!mds->mdsmap->allows_snaps()) {
9796 // snapshots are disabled until explicitly enabled on the mdsmap (fs set <fs_name> allow_new_snaps true)
9797 respond_to_request(mdr, -EPERM);
9798 return;
9799 }
9800
9801 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
9802 if (!diri)
9803 return;
9804
9805 // dir only
9806 if (!diri->is_dir()) {
9807 respond_to_request(mdr, -ENOTDIR);
9808 return;
9809 }
9810 if (diri->is_system() && !diri->is_root()) {
9811 // no snaps in system dirs (root is ok)
9812 respond_to_request(mdr, -EPERM);
9813 return;
9814 }
9815
9816 std::string_view snapname = req->get_filepath().last_dentry();
9817
9818 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9819 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9820 respond_to_request(mdr, -EPERM);
9821 return;
9822 }
9823
9824 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
9825
9826 // lock snap
9827 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9828 MutationImpl::LockOpVec lov;
9829 lov.add_xlock(&diri->snaplock);
9830 if (!mds->locker->acquire_locks(mdr, lov))
9831 return;
9832
9833 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
9834 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
9835 return;
9836 }
9837 mdr->locking_state |= MutationImpl::ALL_LOCKED;
9838 }
9839
9840 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9841 return;
9842
9843 // check if we can create any more snapshots
9844 // we don't allow any more if we are already at or beyond the limit
9845 if (diri->snaprealm &&
9846 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
9847 respond_to_request(mdr, -EMLINK);
9848 return;
9849 }
9850
9851 // make sure name is unique
9852 if (diri->snaprealm &&
9853 diri->snaprealm->exists(snapname)) {
9854 respond_to_request(mdr, -EEXIST);
9855 return;
9856 }
9857 if (snapname.length() == 0 ||
9858 snapname[0] == '_') {
9859 respond_to_request(mdr, -EINVAL);
9860 return;
9861 }

  // allocate a snapid
  if (!mdr->more()->stid) {
    // prepare an stid
    mds->snapclient->prepare_create(diri->ino(), snapname,
                                    mdr->get_mds_stamp(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
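  // second pass (after C_MDS_RetryRequest re-entered us): the snap table
  // prepare has completed, so stid is set and snapidbl holds the newly
  // allocated snapid.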

  version_t stid = mdr->more()->stid;
  snapid_t snapid;
  auto p = mdr->more()->snapidbl.cbegin();
  decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  SnapInfo info;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();

  auto &pi = diri->project_inode(false, true);
  pi.inode.ctime = info.stamp;
  if (info.stamp > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = info.stamp;
  pi.inode.rstat.rsnaps++;
  pi.inode.version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  newsnap.created = snapid;
  auto em = newsnap.snaps.emplace(std::piecewise_construct,
                                  std::forward_as_tuple(snapid),
                                  std::forward_as_tuple(info));
  if (!em.second)
    em.first->second = info;
  newsnap.seq = snapid;
  newsnap.last_created = snapid;
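  // advancing the realm's seq to the new snapid is what prompts clients
  // to refresh their cached snap contexts once this update is broadcast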

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

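  // if the directory had no snaprealm before this snapshot, opening one
  // splits it out of its parent's realm, so peers must be told to SPLIT
  // rather than merely CREATE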
  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, op);

  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
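
// For reference (not part of this function's control flow): clients
// normally reach these snapshot handlers by manipulating the virtual
// snapshot directory, ".snap" by default, e.g.
//   mkdir /mnt/cephfs/some/dir/.snap/mysnap    -> CEPH_MDS_OP_MKSNAP
//   rmdir /mnt/cephfs/some/dir/.snap/mysnap    -> CEPH_MDS_OP_RMSNAP
//   mv .snap/mysnap .snap/newname              -> CEPH_MDS_OP_RENAMESNAP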


// RMSNAP

struct C_MDS_rmsnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_rmsnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid "
             << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
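  // second pass: the snap table transaction is prepared, and snapidbl now
  // carries the table's new sequence number, which becomes the realm's
  // seq and last_destroyed below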
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto &pi = diri->project_inode(false, true);
  pi.inode.version = diri->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.rstat.rsnaps--;

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.seq = seq;
  newnode.last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // drop cached snapshot data that is no longer referenced by any live snap
  if (diri->snaprealm->have_past_parents_open())
    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}

struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  // filepath carries the destination name and filepath2 the source name;
  // both must refer to the same directory inode
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto &pi = diri->project_inode(false, true);
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.version = diri->pre_dirty();

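  // the rename only rewrites SnapInfo::name in the projected snaprealm
  // node; the snapid itself never changes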
  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}

/**
 * Return true if the server is in state RECONNECT and this
 * client has not yet reconnected.
 */
bool Server::waiting_for_reconnect(client_t c) const
{
  return client_reconnect_gather.count(c) > 0;
}

void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}