ceph.git / ceph / src / mds / Server.cc (import ceph quincy 17.2.1)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
34
35 #include "msg/Messenger.h"
36
37 #include "osdc/Objecter.h"
38
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
45
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
53
54 #include <errno.h>
55
56 #include <list>
57 #include <regex>
58 #include <string_view>
59 #include <functional>
60
61 #include "common/config.h"
62
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
65 #undef dout_prefix
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
67
68 using namespace std;
69
70 class ServerContext : public MDSContext {
71 protected:
72 Server *server;
73 MDSRank *get_mds() override
74 {
75 return server->mds;
76 }
77
78 public:
79 explicit ServerContext(Server *s) : server(s) {
80 ceph_assert(server != NULL);
81 }
82 };
83
84 class Batch_Getattr_Lookup : public BatchOp {
85 protected:
86 Server* server;
87 ceph::ref_t<MDRequestImpl> mdr;
88 std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
89 int res = 0;
90 public:
91 Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
92 : server(s), mdr(r) {
93 if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
94 mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
95 else
96 mdr->batch_op_map = &mdr->in[0]->batch_ops;
97 }
98 void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
99 batch_reqs.push_back(r);
100 }
101 ceph::ref_t<MDRequestImpl> find_new_head() override {
102 while (!batch_reqs.empty()) {
103 auto r = std::move(batch_reqs.back());
104 batch_reqs.pop_back();
105 if (r->killed)
106 continue;
107
108 r->batch_op_map = mdr->batch_op_map;
109 mdr->batch_op_map = nullptr;
110 mdr = r;
111 return mdr;
112 }
113 return nullptr;
114 }
115 void _forward(mds_rank_t t) override {
116 MDCache* mdcache = server->mdcache;
117 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
118 mdr->set_mds_stamp(ceph_clock_now());
119 for (auto& m : batch_reqs) {
120 if (!m->killed)
121 mdcache->request_forward(m, t);
122 }
123 batch_reqs.clear();
124 }
125 void _respond(int r) override {
126 mdr->set_mds_stamp(ceph_clock_now());
127 for (auto& m : batch_reqs) {
128 if (!m->killed) {
129 m->tracei = mdr->tracei;
130 m->tracedn = mdr->tracedn;
131 server->respond_to_request(m, r);
132 }
133 }
134 batch_reqs.clear();
135 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
136 }
137 void print(std::ostream& o) {
138 o << "[batch front=" << *mdr << "]";
139 }
140 };
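/*
 * A minimal, standalone sketch of the batching pattern above: duplicate
 * lookups/getattrs queue up behind one in-flight "head" request; when the
 * head completes, every queued request receives the same result, and if the
 * head is cancelled a surviving queued request is promoted to become the new
 * head. The Request type and respond() callback are hypothetical stand-ins,
 * not Ceph APIs.
 *
 *   #include <functional>
 *   #include <memory>
 *   #include <vector>
 *
 *   struct Request {
 *     bool killed = false;
 *     std::function<void(int)> respond;          // deliver the final result
 *   };
 *
 *   struct BatchedOp {
 *     std::shared_ptr<Request> head;                 // currently executing
 *     std::vector<std::shared_ptr<Request>> waiters; // piggy-backed duplicates
 *
 *     void add(std::shared_ptr<Request> r) { waiters.push_back(std::move(r)); }
 *
 *     // Head finished: every surviving waiter gets the same answer.
 *     void respond_all(int result) {
 *       for (auto& w : waiters)
 *         if (!w->killed)
 *           w->respond(result);
 *       waiters.clear();
 *       head->respond(result);
 *     }
 *
 *     // Head was cancelled: promote the next surviving waiter.
 *     std::shared_ptr<Request> find_new_head() {
 *       while (!waiters.empty()) {
 *         auto r = std::move(waiters.back());
 *         waiters.pop_back();
 *         if (!r->killed) {
 *           head = r;
 *           return head;
 *         }
 *       }
 *       return nullptr;
 *     }
 *   };
 */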
141
142 class ServerLogContext : public MDSLogContextBase {
143 protected:
144 Server *server;
145 MDSRank *get_mds() override
146 {
147 return server->mds;
148 }
149
150 MDRequestRef mdr;
151 void pre_finish(int r) override {
152 if (mdr)
153 mdr->mark_event("journal_committed: ");
154 }
155 public:
156 explicit ServerLogContext(Server *s) : server(s) {
157 ceph_assert(server != NULL);
158 }
159 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
160 ceph_assert(server != NULL);
161 }
162 };
163
164 void Server::create_logger()
165 {
166 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
167
168 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
169 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
170 plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
171 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
172 plb.add_u64_counter(l_mdss_handle_client_session,
173 "handle_client_session", "Client session messages", "hcs",
174 PerfCountersBuilder::PRIO_INTERESTING);
175 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
176 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
177 plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
178 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
179 PerfCountersBuilder::PRIO_INTERESTING);
180
181 // fop latencies are useful
182 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
183 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
184 "Request type lookup hash of inode latency");
185 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
186 "Request type lookup inode latency");
187 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
188 "Request type lookup parent latency");
189 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
190 "Request type lookup name latency");
191 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
192 "Request type lookup latency");
193 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
194 "Request type lookup snapshot latency");
195 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
196 "Request type get attribute latency");
197 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
198 "Request type set attribute latency");
199 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
200 "Request type set file layout latency");
201 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
202 "Request type set directory layout latency");
203 plb.add_time_avg(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
204 "Request type get virtual extended attribute latency");
205 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
206 "Request type set extended attribute latency");
207 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
208 "Request type remove extended attribute latency");
209 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
210 "Request type read directory latency");
211 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
212 "Request type set file lock latency");
213 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
214 "Request type get file lock latency");
215 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
216 "Request type create latency");
217 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
218 "Request type open latency");
219 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
220 "Request type make node latency");
221 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
222 "Request type link latency");
223 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
224 "Request type unlink latency");
225 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
226 "Request type remove directory latency");
227 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
228 "Request type rename latency");
229 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
230 "Request type make directory latency");
231 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
232 "Request type symbolic link latency");
233 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
234 "Request type list snapshot latency");
235 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
236 "Request type make snapshot latency");
237 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
238 "Request type remove snapshot latency");
239 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
240 "Request type rename snapshot latency");
241
242 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
243 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
244 "Client requests dispatched");
245 plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
246 "Server requests dispatched");
247
248 logger = plb.create_perf_counters();
249 g_ceph_context->get_perfcounters_collection()->add(logger);
250 }
251
252 Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
253 mds(m),
254 mdcache(mds->mdcache), mdlog(mds->mdlog),
255 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
256 metrics_handler(metrics_handler)
257 {
258 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
259 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
260 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
261 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
262 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
263 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
264 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
265 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
266 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
267 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
268 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
269 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
270 supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
271 }
272
273 void Server::dispatch(const cref_t<Message> &m)
274 {
275 switch (m->get_type()) {
276 case CEPH_MSG_CLIENT_RECONNECT:
277 handle_client_reconnect(ref_cast<MClientReconnect>(m));
278 return;
279 }
280
281 /*
282  * In the reconnect phase, a client may have sent unsafe requests to the mds before its reconnect msg. Setting sessionclosed_isok handles a scenario like this:
283
284  1. In the reconnect phase, the client sent unsafe requests to the mds.
285  2. The reconnect timeout was reached. All sessions that did not send a reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
286  (Another situation is #31668, which denies all client reconnect msgs to speed up reboot.)
287  3. So these unsafe requests, from sessions that did not send a reconnect msg in time or that were denied, can be handled in the clientreplay phase.
288
289  */
290 bool sessionclosed_isok = replay_unsafe_with_closed_session;
291 // active?
292 // handle_peer_request()/handle_client_session() will wait if necessary
293 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
294 const auto &req = ref_cast<MClientRequest>(m);
295 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
296 Session *session = mds->get_session(req);
297 if (!session || (!session->is_open() && !sessionclosed_isok)) {
298 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
299 return;
300 }
301 bool queue_replay = false;
302 if (req->is_replay() || req->is_async()) {
303 dout(3) << "queuing replayed op" << dendl;
304 queue_replay = true;
305 if (req->head.ino &&
306 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
307 inodeno_t ino(req->head.ino);
308 mdcache->add_replay_ino_alloc(ino);
309 if (replay_unsafe_with_closed_session &&
310 session->free_prealloc_inos.contains(ino)) {
311 // don't purge inodes that will be created by later replay
312 session->free_prealloc_inos.erase(ino);
313 session->delegated_inos.insert(ino);
314 }
315 }
316 } else if (req->get_retry_attempt()) {
317 // process completed request in clientreplay stage. The completed request
318 // might have created a new file/directory. This guarantees the MDS sends a reply
319 // to the client before another request modifies the new file/directory.
320 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
321 dout(3) << "queuing completed op" << dendl;
322 queue_replay = true;
323 }
324 // this request was created before the cap reconnect message, drop any embedded
325 // cap releases.
326 req->releases.clear();
327 }
328 if (queue_replay) {
329 req->mark_queued_for_replay();
330 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
331 return;
332 }
333 }
334
335 bool wait_for_active = true;
336 if (mds->is_stopping()) {
337 wait_for_active = false;
338 } else if (mds->is_clientreplay()) {
339 if (req->is_queued_for_replay()) {
340 wait_for_active = false;
341 }
342 }
343 if (wait_for_active) {
344 dout(3) << "not active yet, waiting" << dendl;
345 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
346 return;
347 }
348 }
349
350 switch (m->get_type()) {
351 case CEPH_MSG_CLIENT_SESSION:
352 handle_client_session(ref_cast<MClientSession>(m));
353 return;
354 case CEPH_MSG_CLIENT_REQUEST:
355 handle_client_request(ref_cast<MClientRequest>(m));
356 return;
357 case CEPH_MSG_CLIENT_RECLAIM:
358 handle_client_reclaim(ref_cast<MClientReclaim>(m));
359 return;
360 case MSG_MDS_PEER_REQUEST:
361 handle_peer_request(ref_cast<MMDSPeerRequest>(m));
362 return;
363 default:
364 derr << "server unknown message " << m->get_type() << dendl;
365 ceph_abort_msg("server unknown message");
366 }
367 }
368
369
370
371 // ----------------------------------------------------------
372 // SESSION management
373
374 class C_MDS_session_finish : public ServerLogContext {
375 Session *session;
376 uint64_t state_seq;
377 bool open;
378 version_t cmapv;
379 interval_set<inodeno_t> inos_to_free;
380 version_t inotablev;
381 interval_set<inodeno_t> inos_to_purge;
382 LogSegment *ls = nullptr;
383 Context *fin;
384 public:
385 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
386 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
387 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
388 const interval_set<inodeno_t>& to_free, version_t iv,
389 const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
390 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
391 inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
392 void finish(int r) override {
393 ceph_assert(r == 0);
394 server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
395 if (fin) {
396 fin->complete(r);
397 }
398 }
399 };
400
401 Session* Server::find_session_by_uuid(std::string_view uuid)
402 {
403 Session* session = nullptr;
404 for (auto& it : mds->sessionmap.get_sessions()) {
405 auto& metadata = it.second->info.client_metadata;
406
407 auto p = metadata.find("uuid");
408 if (p == metadata.end() || p->second != uuid)
409 continue;
410
411 if (!session) {
412 session = it.second;
413 } else if (!session->reclaiming_from) {
414 ceph_assert(it.second->reclaiming_from == session);
415 session = it.second;
416 } else {
417 ceph_assert(session->reclaiming_from == it.second);
418 }
419 }
420 return session;
421 }
422
423 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
424 {
425 if (!session->is_open() && !session->is_stale()) {
426 dout(10) << "session not open, dropping this req" << dendl;
427 return;
428 }
429
430 auto reply = make_message<MClientReclaimReply>(0);
431 if (m->get_uuid().empty()) {
432 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
433 reply->set_result(-CEPHFS_EINVAL);
434 mds->send_message_client(reply, session);
435 return;
436 }
437
438 unsigned flags = m->get_flags();
439 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
440 dout(10) << __func__ << " unsupported flags" << dendl;
441 reply->set_result(-CEPHFS_EOPNOTSUPP);
442 mds->send_message_client(reply, session);
443 return;
444 }
445
446 Session* target = find_session_by_uuid(m->get_uuid());
447 if (target) {
448 if (session->info.auth_name != target->info.auth_name) {
449 dout(10) << __func__ << " session auth_name " << session->info.auth_name
450 << " != target auth_name " << target->info.auth_name << dendl;
451 reply->set_result(-CEPHFS_EPERM);
452 mds->send_message_client(reply, session);
453 }
454
455 ceph_assert(!target->reclaiming_from);
456 ceph_assert(!session->reclaiming_from);
457 session->reclaiming_from = target;
458 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
459 }
460
461 if (flags & CEPH_RECLAIM_RESET) {
462 finish_reclaim_session(session, reply);
463 return;
464 }
465
466 ceph_abort();
467 }
468
469 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
470 {
471 Session *target = session->reclaiming_from;
472 if (target) {
473 session->reclaiming_from = nullptr;
474
475 Context *send_reply;
476 if (reply) {
477 int64_t session_id = session->get_client().v;
478 send_reply = new LambdaContext([this, session_id, reply](int r) {
479 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
480 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
481 if (!session) {
482 return;
483 }
484 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
485 reply->set_epoch(epoch);
486 mds->send_message_client(reply, session);
487 });
488 } else {
489 send_reply = nullptr;
490 }
491
492 bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
493 return map.is_blocklisted(target->info.inst.addr);
494 });
495
496 if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
497 kill_session(target, send_reply);
498 } else {
499 CachedStackStringStream css;
500 mds->evict_client(target->get_client().v, false, true, *css, send_reply);
501 }
502 } else if (reply) {
503 mds->send_message_client(reply, session);
504 }
505 }
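/*
 * The deferred-reply lambda above captures only the numeric client id, not
 * the Session pointer, and re-resolves the session when the callback runs.
 * That way a session torn down while the eviction was in flight is simply
 * skipped instead of dereferenced. A standalone sketch of the same pattern
 * with hypothetical types (Registry, Item), not Ceph APIs:
 *
 *   #include <cstdint>
 *   #include <functional>
 *   #include <map>
 *
 *   struct Item { int value = 0; };
 *
 *   struct Registry {
 *     std::map<uint64_t, Item> items;
 *     Item* find(uint64_t id) {
 *       auto it = items.find(id);
 *       return it == items.end() ? nullptr : &it->second;
 *     }
 *   };
 *
 *   // Build a completion that is safe to run long after it was created:
 *   // it holds an id, not a pointer that may dangle.
 *   std::function<void()> make_deferred_reply(Registry& reg, uint64_t id) {
 *     return [&reg, id]() {
 *       Item* item = reg.find(id);   // re-lookup at completion time
 *       if (!item)
 *         return;                    // already gone: nothing to do
 *       item->value += 1;            // safe to touch it now
 *     };
 *   }
 */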
506
507 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
508 {
509 Session *session = mds->get_session(m);
510 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
511 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
512
513 if (!session) {
514 dout(0) << " ignoring sessionless msg " << *m << dendl;
515 return;
516 }
517
518 std::string_view fs_name = mds->mdsmap->get_fs_name();
519 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
520 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
521 return;
522 }
523
524 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
525 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
526 return;
527 }
528
529 if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
530 finish_reclaim_session(session);
531 } else {
532 reclaim_session(session, m);
533 }
534 }
535
536 void Server::handle_client_session(const cref_t<MClientSession> &m)
537 {
538 version_t pv;
539 Session *session = mds->get_session(m);
540
541 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
542 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
543
544 if (!session) {
545 dout(0) << " ignoring sessionless msg " << *m << dendl;
546 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
547 reply->metadata["error_string"] = "sessionless";
548 mds->send_message(reply, m->get_connection());
549 return;
550 }
551
552 std::string_view fs_name = mds->mdsmap->get_fs_name();
553 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
554 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
555 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
556 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
557 std::string(fs_name) + "\"";
558 mds->send_message(std::move(reply), m->get_connection());
559 return;
560 }
561
562 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
563 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
564 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
565 // close requests need to be handled when mds is active
566 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
567 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
568 return;
569 }
570 } else {
571 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
572 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
573 return;
574 }
575 }
576
577 if (logger)
578 logger->inc(l_mdss_handle_client_session);
579
580 uint64_t sseq = 0;
581 switch (m->get_op()) {
582 case CEPH_SESSION_REQUEST_OPEN:
583 if (session->is_opening() ||
584 session->is_open() ||
585 session->is_stale() ||
586 session->is_killing() ||
587 terminating_sessions) {
588 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
589 return;
590 }
591 ceph_assert(session->is_closed() || session->is_closing());
592
593 if (mds->is_stopping()) {
594 dout(10) << "mds is stopping, dropping open req" << dendl;
595 return;
596 }
597
598 {
599 auto& addr = session->info.inst.addr;
600 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
601 auto& client_metadata = session->info.client_metadata;
602
603 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
604 auto now = ceph_clock_now();
605 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
606 auto elapsed = now - m->get_recv_stamp();
607 CachedStackStringStream css;
608 *css << "New client session:"
609 << " addr=\"" << session->info.inst.addr << "\""
610 << ",elapsed=" << elapsed
611 << ",throttled=" << throttle_elapsed
612 << ",status=\"" << status << "\"";
613 if (!err.empty()) {
614 *css << ",error=\"" << err << "\"";
615 }
616 const auto& metadata = session->info.client_metadata;
617 if (auto it = metadata.find("root"); it != metadata.end()) {
618 *css << ",root=\"" << it->second << "\"";
619 }
620 dout(2) << css->strv() << dendl;
621 };
622
623 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
624 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
625 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
626 m->metadata["error_string"] = err_str;
627 mds->send_message_client(m, session);
628 log_session_status("REJECTED", err_str);
629 };
630
631 bool blocklisted = mds->objecter->with_osdmap(
632 [&addr](const OSDMap &osd_map) -> bool {
633 return osd_map.is_blocklisted(addr);
634 });
635
636 if (blocklisted) {
637 dout(10) << "rejecting blocklisted client " << addr << dendl;
638 // This goes on the wire and the "blacklisted" substring is
639 // depended upon by the kernel client for detecting whether it
640 // has been blocklisted. If mounted with recover_session=clean
641 // (since 5.4), it tries to automatically recover itself from
642 // blocklisting.
643 unsigned flags = 0;
644 flags |= MClientSession::SESSION_BLOCKLISTED;
645 send_reject_message("blocklisted (blacklisted)", flags);
646 session->clear();
647 break;
648 }
649
650 if (client_metadata.features.empty())
651 infer_supported_features(session, client_metadata);
652
653 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
654 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
655 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
656 for (const auto& p : client_metadata) {
657 dout(20) << " " << p.first << ": " << p.second << dendl;
658 }
659
660 feature_bitset_t missing_features = required_client_features;
661 missing_features -= client_metadata.features;
662 if (!missing_features.empty()) {
663 CachedStackStringStream css;
664 *css << "missing required features '" << missing_features << "'";
665 send_reject_message(css->strv());
666 mds->clog->warn() << "client session (" << session->info.inst
667 << ") lacks required features " << missing_features
668 << "; client supports " << client_metadata.features;
669 session->clear();
670 break;
671 }
672
673 // Special case for the 'root' metadata path; validate that the claimed
674 // root is actually within the caps of the session
675 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
676 auto claimed_root = it->second;
677 CachedStackStringStream css;
678 bool denied = false;
679 // claimed_root has a leading "/" which we strip before passing
680 // into caps check
681 if (claimed_root.empty() || claimed_root[0] != '/') {
682 denied = true;
683 *css << "invalue root '" << claimed_root << "'";
684 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
685 denied = true;
686 *css << "non-allowable root '" << claimed_root << "'";
687 }
688
689 if (denied) {
690 // Tell the client we're rejecting their open
691 send_reject_message(css->strv());
692 mds->clog->warn() << "client session with " << css->strv()
693 << " denied (" << session->info.inst << ")";
694 session->clear();
695 break;
696 }
697 }
698
699 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
700 if (find_session_by_uuid(it->second)) {
701 send_reject_message("duplicated session uuid");
702 mds->clog->warn() << "client session with duplicated session uuid '"
703 << it->second << "' denied (" << session->info.inst << ")";
704 session->clear();
705 break;
706 }
707 }
708
709 if (session->is_closed()) {
710 mds->sessionmap.add_session(session);
711 }
712
713 pv = mds->sessionmap.mark_projected(session);
714 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
715 mds->sessionmap.touch_session(session);
716 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
717 ceph_assert(r == 0);
718 log_session_status("ACCEPTED", "");
719 });
720 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
721 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
722 mdlog->flush();
723 }
724 break;
725
726 case CEPH_SESSION_REQUEST_RENEWCAPS:
727 if (session->is_open() || session->is_stale()) {
728 mds->sessionmap.touch_session(session);
729 if (session->is_stale()) {
730 mds->sessionmap.set_state(session, Session::STATE_OPEN);
731 mds->locker->resume_stale_caps(session);
732 mds->sessionmap.touch_session(session);
733 }
734 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
735 mds->send_message_client(reply, session);
736 } else {
737 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
738 }
739 break;
740
741 case CEPH_SESSION_REQUEST_CLOSE:
742 {
743 if (session->is_closed() ||
744 session->is_closing() ||
745 session->is_killing()) {
746 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
747 return;
748 }
749 if (session->is_importing()) {
750 dout(10) << "ignoring close req on importing session" << dendl;
751 return;
752 }
753 ceph_assert(session->is_open() ||
754 session->is_stale() ||
755 session->is_opening());
756 if (m->get_seq() < session->get_push_seq()) {
757 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
758 << ", dropping" << dendl;
759 return;
760 }
761 // We are getting a seq that is higher than expected.
762 // Handle the same as any other seqn error.
763 //
764 if (m->get_seq() != session->get_push_seq()) {
765 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
766 << ", BUGGY!" << dendl;
767 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
768 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
769 return;
770 }
771 journal_close_session(session, Session::STATE_CLOSING, NULL);
772 }
773 break;
774
775 case CEPH_SESSION_FLUSHMSG_ACK:
776 finish_flush_session(session, m->get_seq());
777 break;
778
779 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
780 if (mds->is_active())
781 mdlog->flush();
782 break;
783
784 default:
785 ceph_abort();
786 }
787 }
788
789 void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
790 if (!session->is_open() ||
791 !session->get_connection() ||
792 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
793 return;
794 }
795
796 version_t seq = session->wait_for_flush(gather.new_sub());
797 mds->send_message_client(
798 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
799 }
800
801 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
802 {
803 for (const auto& client : client_set) {
804 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
805 ceph_assert(session);
806 flush_session(session, gather);
807 }
808 }
809
810 void Server::finish_flush_session(Session *session, version_t seq)
811 {
812 MDSContext::vec finished;
813 session->finish_flush(seq, finished);
814 mds->queue_waiters(finished);
815 }
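/*
 * flush_client_sessions() hands each session one sub-completion from the
 * gather builder; the session completes it when the client's FLUSHMSG_ACK
 * arrives (finish_flush_session), and the gather's final callback fires only
 * after every sub-completion has run. A standalone sketch of that gather
 * idea, with a hypothetical Gather class rather than the Ceph
 * MDSGatherBuilder API:
 *
 *   #include <functional>
 *   #include <memory>
 *
 *   class Gather {
 *     struct State {
 *       int pending = 0;
 *       bool activated = false;
 *       std::function<void()> on_finish;
 *       void maybe_finish() {
 *         if (activated && pending == 0 && on_finish)
 *           on_finish();
 *       }
 *     };
 *     std::shared_ptr<State> st = std::make_shared<State>();
 *   public:
 *     explicit Gather(std::function<void()> fin) { st->on_finish = std::move(fin); }
 *
 *     // Hand out one completion per outstanding operation.
 *     std::function<void()> new_sub() {
 *       ++st->pending;
 *       auto s = st;
 *       return [s]() { --s->pending; s->maybe_finish(); };
 *     }
 *
 *     // Call once all subs have been created; fires immediately if none.
 *     void activate() { st->activated = true; st->maybe_finish(); }
 *   };
 */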
816
817 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
818 const interval_set<inodeno_t>& inos_to_free, version_t piv,
819 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
820 {
821 dout(10) << "_session_logged " << session->info.inst
822 << " state_seq " << state_seq
823 << " " << (open ? "open":"close") << " " << pv
824 << " inos_to_free " << inos_to_free << " inotablev " << piv
825 << " inos_to_purge " << inos_to_purge << dendl;
826
827 if (!open) {
828 if (inos_to_purge.size()){
829 ceph_assert(ls);
830 session->info.prealloc_inos.subtract(inos_to_purge);
831 ls->purging_inodes.insert(inos_to_purge);
832 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
833 mdcache->purge_inodes(inos_to_purge, ls);
834 }
835
836 if (inos_to_free.size()) {
837 ceph_assert(piv);
838 ceph_assert(session->is_closing() || session->is_killing() ||
839 session->is_opening()); // re-open closing session
840 session->info.prealloc_inos.subtract(inos_to_free);
841 mds->inotable->apply_release_ids(inos_to_free);
842 ceph_assert(mds->inotable->get_version() == piv);
843 }
844 session->free_prealloc_inos = session->info.prealloc_inos;
845 session->delegated_inos.clear();
846 }
847
848 mds->sessionmap.mark_dirty(session);
849
850 // apply
851 if (session->get_state_seq() != state_seq) {
852 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
853 << ", noop" << dendl;
854 // close must have been canceled (by an import?), or any number of other things..
855 } else if (open) {
856 ceph_assert(session->is_opening());
857 mds->sessionmap.set_state(session, Session::STATE_OPEN);
858 mds->sessionmap.touch_session(session);
859 metrics_handler->add_session(session);
860 ceph_assert(session->get_connection());
861 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
862 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
863 reply->supported_features = supported_features;
864 reply->metric_spec = supported_metric_spec;
865 }
866 mds->send_message_client(reply, session);
867 if (mdcache->is_readonly()) {
868 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
869 mds->send_message_client(m, session);
870 }
871 } else if (session->is_closing() ||
872 session->is_killing()) {
873 // kill any lingering capabilities, leases, requests
874 bool killing = session->is_killing();
875 while (!session->caps.empty()) {
876 Capability *cap = session->caps.front();
877 CInode *in = cap->get_inode();
878 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
879 mds->locker->remove_client_cap(in, cap, killing);
880 }
881 while (!session->leases.empty()) {
882 ClientLease *r = session->leases.front();
883 CDentry *dn = static_cast<CDentry*>(r->parent);
884 dout(20) << " killing client lease of " << *dn << dendl;
885 dn->remove_client_lease(r, mds->locker);
886 }
887 if (client_reconnect_gather.erase(session->info.get_client())) {
888 dout(20) << " removing client from reconnect set" << dendl;
889 if (client_reconnect_gather.empty()) {
890 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
891 reconnect_gather_finish();
892 }
893 }
894 if (client_reclaim_gather.erase(session->info.get_client())) {
895 dout(20) << " removing client from reclaim set" << dendl;
896 if (client_reclaim_gather.empty()) {
897 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
898 mds->maybe_clientreplay_done();
899 }
900 }
901
902 if (session->is_closing()) {
903 // mark con disposable. if there is a fault, we will get a
904 // reset and clean it up. if the client hasn't received the
905 // CLOSE message yet, they will reconnect and get an
906 // ms_handle_remote_reset() and realize they had in fact closed.
907 // do this *before* sending the message to avoid a possible
908 // race.
909 if (session->get_connection()) {
910 // Conditional because terminate_sessions will indiscriminately
911 // put sessions in CLOSING whether they ever had a conn or not.
912 session->get_connection()->mark_disposable();
913 }
914
915 // reset session
916 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
917 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
918 session->clear();
919 metrics_handler->remove_session(session);
920 mds->sessionmap.remove_session(session);
921 } else if (session->is_killing()) {
922 // destroy session, close connection
923 if (session->get_connection()) {
924 session->get_connection()->mark_down();
925 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
926 session->set_connection(nullptr);
927 }
928 metrics_handler->remove_session(session);
929 mds->sessionmap.remove_session(session);
930 } else {
931 ceph_abort();
932 }
933 } else {
934 ceph_abort();
935 }
936 }
937
938 /**
939 * Inject sessions from some source other than actual connections.
940 *
941 * For example:
942 * - sessions inferred from journal replay
943 * - sessions learned from other MDSs during rejoin
944 * - sessions learned from other MDSs during dir/caps migration
945 * - sessions learned from other MDSs during a cross-MDS rename
946 */
947 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
948 map<client_t,client_metadata_t>& cmm,
949 map<client_t, pair<Session*,uint64_t> >& smap)
950 {
951 version_t pv = mds->sessionmap.get_projected();
952
953 dout(10) << "prepare_force_open_sessions " << pv
954 << " on " << cm.size() << " clients"
955 << dendl;
956
957 mds->objecter->with_osdmap(
958 [this, &cm, &cmm](const OSDMap &osd_map) {
959 for (auto p = cm.begin(); p != cm.end(); ) {
960 if (osd_map.is_blocklisted(p->second.addr)) {
961 dout(10) << " ignoring blocklisted client." << p->first
962 << " (" << p->second.addr << ")" << dendl;
963 cmm.erase(p->first);
964 cm.erase(p++);
965 } else {
966 ++p;
967 }
968 }
969 });
970
971 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
972 Session *session = mds->sessionmap.get_or_add_session(p->second);
973 pv = mds->sessionmap.mark_projected(session);
974 uint64_t sseq;
975 if (session->is_closed() ||
976 session->is_closing() ||
977 session->is_killing()) {
978 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
979 auto q = cmm.find(p->first);
980 if (q != cmm.end())
981 session->info.client_metadata.merge(q->second);
982 } else {
983 ceph_assert(session->is_open() ||
984 session->is_opening() ||
985 session->is_stale());
986 sseq = 0;
987 }
988 smap[p->first] = make_pair(session, sseq);
989 session->inc_importing();
990 }
991 return pv;
992 }
993
994 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
995 bool dec_import)
996 {
997 /*
998 * FIXME: need to carefully consider the race conditions between a
999 * client trying to close a session and an MDS doing an import
1000 * trying to force open a session...
1001 */
1002 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
1003 << " initial v " << mds->sessionmap.get_version() << dendl;
1004
1005 for (auto &it : smap) {
1006 Session *session = it.second.first;
1007 uint64_t sseq = it.second.second;
1008 if (sseq > 0) {
1009 if (session->get_state_seq() != sseq) {
1010 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
1011 } else {
1012 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
1013 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1014 mds->sessionmap.touch_session(session);
1015 metrics_handler->add_session(session);
1016
1017 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1018 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1019 reply->supported_features = supported_features;
1020 reply->metric_spec = supported_metric_spec;
1021 }
1022 mds->send_message_client(reply, session);
1023
1024 if (mdcache->is_readonly())
1025 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1026 }
1027 } else {
1028 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
1029 ceph_assert(session->is_open() || session->is_stale());
1030 }
1031
1032 if (dec_import) {
1033 session->dec_importing();
1034 }
1035
1036 mds->sessionmap.mark_dirty(session);
1037 }
1038
1039 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1040 }
1041
1042 class C_MDS_TerminatedSessions : public ServerContext {
1043 void finish(int r) override {
1044 server->terminating_sessions = false;
1045 }
1046 public:
1047 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
1048 };
1049
1050 void Server::terminate_sessions()
1051 {
1052 dout(5) << "terminating all sessions..." << dendl;
1053
1054 terminating_sessions = true;
1055
1056 // kill them off. clients will retry etc.
1057 set<Session*> sessions;
1058 mds->sessionmap.get_client_session_set(sessions);
1059 for (set<Session*>::const_iterator p = sessions.begin();
1060 p != sessions.end();
1061 ++p) {
1062 Session *session = *p;
1063 if (session->is_closing() ||
1064 session->is_killing() ||
1065 session->is_closed())
1066 continue;
1067 journal_close_session(session, Session::STATE_CLOSING, NULL);
1068 }
1069
1070 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
1071 }
1072
1073
1074 void Server::find_idle_sessions()
1075 {
1076 auto now = clock::now();
1077 auto last_cleared_laggy = mds->last_cleared_laggy();
1078
1079 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1080
1081 // timeout/stale
1082 // (caps go stale, lease die)
1083 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1084 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1085
1086 // don't kick clients if we've been laggy
1087 if (last_cleared_laggy < cutoff) {
1088 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1089 << "), not marking any client stale" << dendl;
1090 return;
1091 }
1092
1093 std::vector<Session*> to_evict;
1094
1095 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1096 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1097 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1098 std::vector<Session*> new_stale;
1099
1100 for (auto session : *(sessions_p1->second)) {
1101 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1102 if (last_cap_renew_span < cutoff) {
1103 dout(20) << "laggiest active session is " << session->info.inst
1104 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1105 break;
1106 }
1107
1108 if (session->last_seen > session->last_cap_renew) {
1109 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1110 if (last_cap_renew_span < cutoff) {
1111 dout(20) << "laggiest active session is " << session->info.inst
1112 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1113 continue;
1114 }
1115 }
1116
1117 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1118 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1119 "has arrived" << dendl;
1120 // evict session without marking it stale
1121 to_evict.push_back(session);
1122 continue;
1123 }
1124
1125 if (defer_session_stale &&
1126 !session->is_any_flush_waiter() &&
1127 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1128 dout(20) << "deferring marking session " << session->info.inst << " stale "
1129 "since it holds no caps" << dendl;
1130 continue;
1131 }
1132
1133 auto it = session->info.client_metadata.find("timeout");
1134 if (it != session->info.client_metadata.end()) {
1135 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1136 if (timeout == 0) {
1137 dout(10) << "skipping session " << session->info.inst
1138 << ", infinite timeout specified" << dendl;
1139 continue;
1140 }
1141 double cutoff = queue_max_age + timeout;
1142 if (last_cap_renew_span < cutoff) {
1143 dout(10) << "skipping session " << session->info.inst
1144 << ", timeout (" << timeout << ") specified"
1145 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1146 continue;
1147 }
1148
1149 // do not go through stale, evict it directly.
1150 to_evict.push_back(session);
1151 } else {
1152 dout(10) << "new stale session " << session->info.inst
1153 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1154 new_stale.push_back(session);
1155 }
1156 }
1157
1158 for (auto session : new_stale) {
1159 mds->sessionmap.set_state(session, Session::STATE_STALE);
1160 if (mds->locker->revoke_stale_caps(session)) {
1161 mds->locker->remove_stale_leases(session);
1162 finish_flush_session(session, session->get_push_seq());
1163 auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
1164 mds->send_message_client(m, session);
1165 } else {
1166 to_evict.push_back(session);
1167 }
1168 }
1169 }
1170
1171 // autoclose
1172 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1173
1174 // Collect a list of sessions exceeding the autoclose threshold
1175 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1176 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1177 for (auto session : *(sessions_p2->second)) {
1178 ceph_assert(session->is_stale());
1179 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1180 if (last_cap_renew_span < cutoff) {
1181 dout(20) << "oldest stale session is " << session->info.inst
1182 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1183 break;
1184 }
1185 to_evict.push_back(session);
1186 }
1187 }
1188
1189 for (auto session: to_evict) {
1190 if (session->is_importing()) {
1191 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1192 continue;
1193 }
1194
1195 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1196 mds->clog->warn() << "evicting unresponsive client " << *session
1197 << ", after " << last_cap_renew_span << " seconds";
1198 dout(10) << "autoclosing stale session " << session->info.inst
1199 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1200
1201 if (g_conf()->mds_session_blocklist_on_timeout) {
1202 CachedStackStringStream css;
1203 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
1204 } else {
1205 kill_session(session, NULL);
1206 }
1207 }
1208 }
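/*
 * The idle-session scan above is a two-step classification: a session whose
 * caps have not been renewed within the session timeout is marked stale (its
 * caps are revoked and a STALE message is sent), and one that has also blown
 * past the autoclose window is evicted outright. A simplified, standalone
 * sketch of that decision; the real code additionally pads the thresholds by
 * the dispatch-queue age and honors a per-session "timeout" metadata
 * override, and the default values below are hypothetical:
 *
 *   enum class Verdict { Healthy, MarkStale, Evict };
 *
 *   Verdict classify(double seconds_since_last_renew,
 *                    double session_timeout   = 60.0,    // hypothetical value
 *                    double session_autoclose = 300.0) { // hypothetical value
 *     if (seconds_since_last_renew >= session_autoclose)
 *       return Verdict::Evict;        // far beyond the window: evict outright
 *     if (seconds_since_last_renew >= session_timeout)
 *       return Verdict::MarkStale;    // revoke caps, send CEPH_SESSION_STALE
 *     return Verdict::Healthy;        // renewed recently enough
 *   }
 */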
1209
1210 void Server::evict_cap_revoke_non_responders() {
1211 if (!cap_revoke_eviction_timeout) {
1212 return;
1213 }
1214
1215 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1216
1217 for (auto const &client: to_evict) {
1218 mds->clog->warn() << "client id " << client << " has not responded to"
1219 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1220 << " seconds, evicting";
1221 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1222 << client << dendl;
1223
1224 CachedStackStringStream css;
1225 bool evicted = mds->evict_client(client.v, false,
1226 g_conf()->mds_session_blocklist_on_evict,
1227 *css, nullptr);
1228 if (evicted && logger) {
1229 logger->inc(l_mdss_cap_revoke_eviction);
1230 }
1231 }
1232 }
1233
1234 void Server::handle_conf_change(const std::set<std::string>& changed) {
1235 if (changed.count("mds_forward_all_requests_to_auth")){
1236 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
1237 }
1238 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1239 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1240 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1241 << cap_revoke_eviction_timeout << dendl;
1242 }
1243 if (changed.count("mds_recall_max_decay_rate")) {
1244 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1245 }
1246 if (changed.count("mds_max_snaps_per_dir")) {
1247 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1248 dout(20) << __func__ << " max snapshots per directory changed to "
1249 << max_snaps_per_dir << dendl;
1250 }
1251 if (changed.count("mds_client_delegate_inos_pct")) {
1252 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1253 }
1254 if (changed.count("mds_max_caps_per_client")) {
1255 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1256 }
1257 if (changed.count("mds_session_cap_acquisition_throttle")) {
1258 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
1259 }
1260 if (changed.count("mds_session_max_caps_throttle_ratio")) {
1261 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
1262 }
1263 if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1264 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1265 }
1266 if (changed.count("mds_alternate_name_max")) {
1267 alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
1268 }
1269 if (changed.count("mds_dir_max_entries")) {
1270 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
1271 dout(20) << __func__ << " max entries per directory changed to "
1272 << dir_max_entries << dendl;
1273 }
1274 if (changed.count("mds_bal_fragment_size_max")) {
1275 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
1276 dout(20) << __func__ << " max fragment size changed to "
1277 << bal_fragment_size_max << dendl;
1278 }
1279 }
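/*
 * The constructor reads each tunable once into a plain member, and
 * handle_conf_change() refreshes only the members whose keys appear in the
 * changed set, so hot request paths never consult the config subsystem.
 * A standalone sketch of that cache-and-refresh pattern, using a
 * hypothetical key/value store in place of g_conf():
 *
 *   #include <cstdint>
 *   #include <map>
 *   #include <set>
 *   #include <string>
 *
 *   struct Tunables {
 *     uint64_t max_caps_per_client = 1048576;
 *     double   cap_revoke_eviction_timeout = 0.0;
 *
 *     void refresh(const std::set<std::string>& changed,
 *                  const std::map<std::string, std::string>& conf) {
 *       if (changed.count("max_caps_per_client"))
 *         max_caps_per_client = std::stoull(conf.at("max_caps_per_client"));
 *       if (changed.count("cap_revoke_eviction_timeout"))
 *         cap_revoke_eviction_timeout = std::stod(conf.at("cap_revoke_eviction_timeout"));
 *     }
 *   };
 */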
1280
1281 /*
1282 * XXX bump in the interface here, not using an MDSContext here
1283 * because all the callers right now happen to use a SaferCond
1284 */
1285 void Server::kill_session(Session *session, Context *on_safe)
1286 {
1287 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1288
1289 if ((session->is_opening() ||
1290 session->is_open() ||
1291 session->is_stale()) &&
1292 !session->is_importing()) {
1293 dout(10) << "kill_session " << session << dendl;
1294 journal_close_session(session, Session::STATE_KILLING, on_safe);
1295 } else {
1296 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1297 if (session->is_closing() ||
1298 session->is_killing()) {
1299 if (on_safe)
1300 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1301 } else {
1302 ceph_assert(session->is_closed() ||
1303 session->is_importing());
1304 if (on_safe)
1305 on_safe->complete(0);
1306 }
1307 }
1308 }
1309
1310 size_t Server::apply_blocklist()
1311 {
1312 std::vector<Session*> victims;
1313 const auto& sessions = mds->sessionmap.get_sessions();
1314 mds->objecter->with_osdmap(
1315 [&](const OSDMap& o) {
1316 for (const auto& p : sessions) {
1317 if (!p.first.is_client()) {
1318 // Do not apply OSDMap blocklist to MDS daemons, we find out
1319 // about their death via MDSMap.
1320 continue;
1321 }
1322 if (o.is_blocklisted(p.second->info.inst.addr)) {
1323 victims.push_back(p.second);
1324 }
1325 }
1326 });
1327
1328 for (const auto& s : victims) {
1329 kill_session(s, nullptr);
1330 }
1331
1332 dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
1333
1334 return victims.size();
1335 }
1336
1337 void Server::journal_close_session(Session *session, int state, Context *on_safe)
1338 {
1339 dout(10) << __func__ << " : "
1340 << session->info.inst
1341 << " pending_prealloc_inos " << session->pending_prealloc_inos
1342 << " free_prealloc_inos " << session->free_prealloc_inos
1343 << " delegated_inos " << session->delegated_inos << dendl;
1344
1345 uint64_t sseq = mds->sessionmap.set_state(session, state);
1346 version_t pv = mds->sessionmap.mark_projected(session);
1347 version_t piv = 0;
1348
1349 // release alloc and pending-alloc inos for this session
1350 // and wipe out session state, in case the session close aborts for some reason
1351 interval_set<inodeno_t> inos_to_free;
1352 inos_to_free.insert(session->pending_prealloc_inos);
1353 inos_to_free.insert(session->free_prealloc_inos);
1354 if (inos_to_free.size()) {
1355 mds->inotable->project_release_ids(inos_to_free);
1356 piv = mds->inotable->get_projected_version();
1357 } else
1358 piv = 0;
1359
1360 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1361 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1362 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1363 mdlog->start_submit_entry(le, fin);
1364 mdlog->flush();
1365
1366 // clean up requests, too
1367 while(!session->requests.empty()) {
1368 auto mdr = MDRequestRef(*session->requests.begin());
1369 mdcache->request_kill(mdr);
1370 }
1371
1372 finish_flush_session(session, session->get_push_seq());
1373 }
1374
1375 void Server::reconnect_clients(MDSContext *reconnect_done_)
1376 {
1377 reconnect_done = reconnect_done_;
1378
1379 auto now = clock::now();
1380 set<Session*> sessions;
1381 mds->sessionmap.get_client_session_set(sessions);
1382 for (auto session : sessions) {
1383 if (session->is_open()) {
1384 client_reconnect_gather.insert(session->get_client());
1385 session->set_reconnecting(true);
1386 session->last_cap_renew = now;
1387 }
1388 }
1389
1390 if (client_reconnect_gather.empty()) {
1391 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1392 reconnect_gather_finish();
1393 return;
1394 }
1395
1396 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1397
1398 reconnect_start = now;
1399 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1400 mds->sessionmap.dump();
1401 }
1402
1403 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1404 {
1405 dout(7) << "handle_client_reconnect " << m->get_source()
1406 << (m->has_more() ? " (more)" : "") << dendl;
1407 client_t from = m->get_source().num();
1408 Session *session = mds->get_session(m);
1409 if (!session) {
1410 dout(0) << " ignoring sessionless msg " << *m << dendl;
1411 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1412 reply->metadata["error_string"] = "sessionless";
1413 mds->send_message(reply, m->get_connection());
1414 return;
1415 }
1416
1417 if (!session->is_open()) {
1418 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1419 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1420 mds->send_message(reply, m->get_connection());
1421 return;
1422 }
1423
1424 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1425
1426 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1427 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1428 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1429 return;
1430 }
1431
1432 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1433 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1434
1435 bool deny = false;
1436 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1437 // XXX maybe in the future we can do better than this?
1438 if (reconnect_all_deny) {
1439 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1440 } else {
1441 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1442 }
1443 mds->clog->info() << "denied reconnect attempt (mds is "
1444 << ceph_mds_state_name(mds->get_state())
1445 << ") from " << m->get_source_inst()
1446 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1447 deny = true;
1448 } else {
1449 std::string error_str;
1450 if (!session->is_open()) {
1451 error_str = "session is closed";
1452 } else if (mdcache->is_readonly()) {
1453 error_str = "mds is readonly";
1454 } else {
1455 if (session->info.client_metadata.features.empty())
1456 infer_supported_features(session, session->info.client_metadata);
1457
1458 feature_bitset_t missing_features = required_client_features;
1459 missing_features -= session->info.client_metadata.features;
1460 if (!missing_features.empty()) {
1461 CachedStackStringStream css;
1462 *css << "missing required features '" << missing_features << "'";
1463 error_str = css->strv();
1464 }
1465 }
1466
1467 if (!error_str.empty()) {
1468 deny = true;
1469 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1470 mds->clog->info() << "denied reconnect attempt from "
1471 << m->get_source_inst() << " (" << error_str << ")";
1472 }
1473 }
1474
1475 if (deny) {
1476 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1477 mds->send_message_client(r, session);
1478 if (session->is_open()) {
1479 client_reconnect_denied.insert(session->get_client());
1480 }
1481 return;
1482 }
1483
1484 if (!m->has_more()) {
1485 metrics_handler->add_session(session);
1486 // notify client of success with an OPEN
1487 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1488 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1489 reply->supported_features = supported_features;
1490 reply->metric_spec = supported_metric_spec;
1491 }
1492 mds->send_message_client(reply, session);
1493 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1494 }
1495
1496 session->last_cap_renew = clock::now();
1497
1498 // snaprealms
1499 for (const auto &r : m->realms) {
1500 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1501 if (in && in->state_test(CInode::STATE_PURGING))
1502 continue;
1503 if (in) {
1504 if (in->snaprealm) {
1505 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1506 } else {
1507 // this can happen if we are non-auth or we rollback snaprealm
1508 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1509 }
1510 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1511 } else {
1512 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1513 << " seq " << r.realm.seq << dendl;
1514 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1515 }
1516 }
1517
1518 // caps
1519 for (const auto &p : m->caps) {
1520 // make sure our last_cap_id is MAX over all issued caps
1521 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1522 mdcache->last_cap_id = p.second.capinfo.cap_id;
1523
1524 CInode *in = mdcache->get_inode(p.first);
1525 if (in && in->state_test(CInode::STATE_PURGING))
1526 continue;
1527 if (in && in->is_auth()) {
1528 // we recovered it, and it's ours. take note.
1529 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1530 << " on " << *in << dendl;
1531 in->reconnect_cap(from, p.second, session);
1532 mdcache->add_reconnected_cap(from, p.first, p.second);
1533 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1534 continue;
1535 }
1536
1537 if (in && !in->is_auth()) {
1538 // not mine.
1539 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1540 // add to cap export list.
1541 mdcache->rejoin_export_caps(p.first, from, p.second,
1542 in->authority().first, true);
1543 } else {
1544 // don't know if the inode is mine
1545 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1546 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1547 }
1548 }
1549
1550 reconnect_last_seen = clock::now();
1551
1552 if (!m->has_more()) {
1553 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1554
1555 // remove from gather set
1556 client_reconnect_gather.erase(from);
1557 session->set_reconnecting(false);
1558 if (client_reconnect_gather.empty())
1559 reconnect_gather_finish();
1560 }
1561 }
1562
1563 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1564 {
1565 int supported = -1;
1566 auto it = client_metadata.find("ceph_version");
1567 if (it != client_metadata.end()) {
1568 // user space client
1569 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1570 supported = CEPHFS_FEATURE_LUMINOUS;
1571 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1572 supported = CEPHFS_FEATURE_KRAKEN;
1573 } else {
1574 it = client_metadata.find("kernel_version");
1575 if (it != client_metadata.end()) {
1576 // kernel client
1577 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1578 supported = CEPHFS_FEATURE_LUMINOUS;
1579 }
1580 }
1581 if (supported == -1 &&
1582 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1583 supported = CEPHFS_FEATURE_JEWEL;
1584
1585 if (supported >= 0) {
1586 unsigned long value = (1UL << (supported + 1)) - 1;
1587 client_metadata.features = feature_bitset_t(value);
1588 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1589 }
1590 }
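// A worked example of the mask computation above (a sketch; the numeric value
// of CEPHFS_FEATURE_LUMINOUS is an assumption here -- see cephfs_features.h
// for the authoritative bit assignments): if the inferred top feature bit is 7,
//
//   unsigned long value = (1UL << (7 + 1)) - 1;      // == 0xff, bits 0..7 all set
//   client_metadata.features = feature_bitset_t(value);
//
// i.e. an old client that couldn't advertise its features is credited with
// every feature up to and including the highest one we could infer, and
// nothing newer.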
1591
1592 void Server::update_required_client_features()
1593 {
1594 required_client_features = mds->mdsmap->get_required_client_features();
1595 dout(7) << "required_client_features: " << required_client_features << dendl;
1596
1597 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1598 set<Session*> sessions;
1599 mds->sessionmap.get_client_session_set(sessions);
1600 for (auto session : sessions) {
1601 feature_bitset_t missing_features = required_client_features;
1602 missing_features -= session->info.client_metadata.features;
1603 if (!missing_features.empty()) {
1604 bool blocklisted = mds->objecter->with_osdmap(
1605 [session](const OSDMap &osd_map) -> bool {
1606 return osd_map.is_blocklisted(session->info.inst.addr);
1607 });
1608 if (blocklisted)
1609 continue;
1610
1611 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1612 << missing_features << "'";
1613 CachedStackStringStream css;
1614 mds->evict_client(session->get_client().v, false,
1615 g_conf()->mds_session_blocklist_on_evict, *css);
1616 }
1617 }
1618 }
1619 }
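// Example of the bitset arithmetic above (the feature names are illustrative;
// the actual set comes from the MDSMap's required_client_features): if the map
// requires {reply_encoding, deleg_ino} and a session only advertises
// {reply_encoding}, then
//
//   feature_bitset_t missing_features = required_client_features;  // {reply_encoding, deleg_ino}
//   missing_features -= session->info.client_metadata.features;    // -> {deleg_ino}
//
// and, since missing_features is non-empty, the session is evicted unless the
// client is already blocklisted in the OSDMap.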
1620
1621 void Server::reconnect_gather_finish()
1622 {
1623 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1624 ceph_assert(reconnect_done);
1625
1626 if (!mds->snapclient->is_synced()) {
1627 // make sure snaptable cache is populated. snaprealms will be
1628 // extensively used in rejoin stage.
1629 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1630 mds->snapclient->wait_for_sync(reconnect_done);
1631 } else {
1632 reconnect_done->complete(0);
1633 }
1634 reconnect_done = NULL;
1635 }
1636
1637 void Server::reconnect_tick()
1638 {
1639 bool reject_all_reconnect = false;
1640 if (reconnect_evicting) {
1641 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1642 return;
1643 }
1644
1645 /*
1646 * Set mds_deny_all_reconnect to reject all reconnect requests; the MDS then
1647 * loads less metadata in the rejoin phase, which shortens the reboot time.
1648 * Moreover, loading less metadata increases the chance that a standby with less memory can failover.
1649 *
1650 * Why not shorten the reconnect period instead?
1651 * Clients may send unsafe or retry requests, which had not been
1652 * completed before the old mds stopped, to the new mds. These requests may
1653 * need to be processed during the new mds's clientreplay phase,
1654 * see: https://github.com/ceph/ceph/pull/29059.
1655 */
1656 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1657 if (client_reconnect_gather.empty())
1658 return;
1659
1660 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1661 reject_all_reconnect = true;
1662
1663 auto now = clock::now();
1664 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1665 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
1666 return;
1667
1668 vector<Session*> remaining_sessions;
1669 remaining_sessions.reserve(client_reconnect_gather.size());
1670 for (auto c : client_reconnect_gather) {
1671 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1672 ceph_assert(session);
1673 remaining_sessions.push_back(session);
1674 // client re-sends cap flush messages before the reconnect message
1675 if (session->last_seen > reconnect_last_seen)
1676 reconnect_last_seen = session->last_seen;
1677 }
1678
1679 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1680 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1681 dout(7) << "reconnect_tick: last seen " << elapse2
1682 << " seconds ago, extending reconnect interval" << dendl;
1683 return;
1684 }
1685
1686 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1687 << " clients have not reconnected in time" << dendl;
1688
1689 // If we're doing blocklist evictions, use this to wait for them before
1690 // proceeding to reconnect_gather_finish
1691 MDSGatherBuilder gather(g_ceph_context);
1692
1693 for (auto session : remaining_sessions) {
1694 // Keep sessions that have specified a timeout. These sessions prevent the
1695 // mds from going active; the MDS goes active only after they have all been
1696 // killed or reclaimed.
1697 if (session->info.client_metadata.find("timeout") !=
1698 session->info.client_metadata.end()) {
1699 dout(1) << "reconnect keeps " << session->info.inst
1700 << ", need to be reclaimed" << dendl;
1701 client_reclaim_gather.insert(session->get_client());
1702 continue;
1703 }
1704
1705 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1706
1707 mds->clog->warn() << "evicting unresponsive client " << *session
1708 << ", after waiting " << elapse1
1709 << " seconds during MDS startup";
1710
1711 // make _session_logged() purge orphan objects of lost async/unsafe requests
1712 session->delegated_inos.swap(session->free_prealloc_inos);
1713
1714 if (g_conf()->mds_session_blocklist_on_timeout) {
1715 CachedStackStringStream css;
1716 mds->evict_client(session->get_client().v, false, true, *css,
1717 gather.new_sub());
1718 } else {
1719 kill_session(session, NULL);
1720 }
1721
1722 failed_reconnects++;
1723 }
1724 client_reconnect_gather.clear();
1725 client_reconnect_denied.clear();
1726
1727 if (gather.has_subs()) {
1728 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1729 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1730 [this](int r){reconnect_gather_finish();})));
1731 gather.activate();
1732 reconnect_evicting = true;
1733 } else {
1734 reconnect_gather_finish();
1735 }
1736 }
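// Timing sketch for the two windows checked in reconnect_tick() above
// (45 is used purely as an illustrative value for mds_reconnect_timeout):
//
//   elapse1 = now - reconnect_start      // give up only once this exceeds 45s
//   elapse2 = now - reconnect_last_seen  // ...and this exceeds 45s/2 = 22.5s
//
// The second window extends the reconnect phase while any still-missing client
// is at least sending cap flushes. If mds_deny_all_reconnect is set and every
// remaining client has already been denied, the tick skips both checks and
// proceeds to evict (or hand off for reclaim) the stragglers immediately.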
1737
1738 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1739 {
1740 if (!locks.length()) return;
1741 int numlocks;
1742 ceph_filelock lock;
1743 auto p = locks.cbegin();
1744 decode(numlocks, p);
1745 for (int i = 0; i < numlocks; ++i) {
1746 decode(lock, p);
1747 lock.client = client;
1748 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1749 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1750 }
1751 decode(numlocks, p);
1752 for (int i = 0; i < numlocks; ++i) {
1753 decode(lock, p);
1754 lock.client = client;
1755 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1756 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1757 }
1758 }
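// Illustrative sketch of the bufferlist layout recover_filelocks() expects
// from the client's reconnect payload. fcntl_locks / flock_locks below are
// hypothetical containers of ceph_filelock, and the sketch assumes the same
// encoder is available that the decode() calls above rely on:
//
//   bufferlist locks;
//   int num = (int)fcntl_locks.size();
//   encode(num, locks);                    // count of fcntl (POSIX) locks
//   for (const auto& fl : fcntl_locks)
//     encode(fl, locks);                   // one ceph_filelock per held lock
//   num = (int)flock_locks.size();
//   encode(num, locks);                    // count of BSD flock locks
//   for (const auto& fl : flock_locks)
//     encode(fl, locks);
//
// i.e. two (count, lock-array) sections: fcntl locks first, then flock locks,
// which is exactly the order the decode loops above consume.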
1759
1760 /**
1761 * Call this when the MDCache is oversized, to send requests to the clients
1762 * to trim some caps, and consequently unpin some inodes in the MDCache so
1763 * that it can trim too.
1764 */
1765 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1766 {
1767 const auto now = clock::now();
1768 const bool steady = !!(flags&RecallFlags::STEADY);
1769 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1770 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1771 const bool trim = !!(flags&RecallFlags::TRIM);
1772
1773 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1774 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1775 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1776 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1777 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1778 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1779
1780 dout(7) << __func__ << ":"
1781 << " min=" << min_caps_per_client
1782 << " max=" << max_caps_per_client
1783 << " total=" << Capability::count()
1784 << " flags=" << flags
1785 << dendl;
1786
1787 /* trim caps of sessions with the most caps first */
1788 std::multimap<uint64_t, Session*> caps_session;
1789 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1790 auto num_caps = s->caps.size();
1791 auto cache_liveness = s->get_session_cache_liveness();
1792 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1793 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1794 }
1795 };
1796 mds->sessionmap.get_client_sessions(std::move(f));
1797
1798 std::pair<bool, uint64_t> result = {false, 0};
1799 auto& [throttled, caps_recalled] = result;
1800 last_recall_state = now;
1801 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1802 if (!session->is_open() ||
1803 !session->get_connection() ||
1804 !session->info.inst.name.is_client())
1805 continue;
1806
1807 dout(10) << __func__ << ":"
1808 << " session " << session->info.inst
1809 << " caps " << num_caps
1810 << ", leases " << session->leases.size()
1811 << dendl;
1812
1813 uint64_t newlim;
1814 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1815 newlim = min_caps_per_client;
1816 } else {
1817 newlim = num_caps-recall_max_caps;
1818 }
1819 if (num_caps > newlim) {
1820 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1821 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1822 newlim = num_caps-recall;
1823 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1824 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1825 const uint64_t global_recall_throttle = recall_throttle.get();
1826 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1827 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1828 throttled = true;
1829 continue;
1830 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1831 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1832 throttled = true;
1833 continue;
1834 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1835 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1836 throttled = true;
1837 break;
1838 }
1839
1840 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1841 if (steady) {
1842 const auto session_recall = session->get_recall_caps();
1843 const auto session_release = session->get_release_caps();
1844 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1845 /* The session has been unable to keep up with the number of caps
1846 * recalled (it has released fewer than half of them); additionally, to avoid
1847 * penalizing sessions we've only just begun to recall from, the session_recall
1848 * counter (decayed count of caps recently recalled) must be **greater** than
1849 * half the session's cap recall decay threshold.
1850 */
1851 dout(15) << " 2*session_release < session_recall"
1852 " (2*" << session_release << " < " << session_recall << ") &&"
1853 " 2*session_recall < recall_max_decay_threshold"
1854 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1855 " Skipping because we are unlikely to get more released." << dendl;
1856 continue;
1857 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1858 /* The number of caps to recall is less than the number we *could*
1859 * recall (so there isn't much left to recall?) and it is also less than
1860 * half the current recall_caps counter (decayed count of caps recently
1861 * recalled).
1862 */
1863 dout(15) << " 2*recall < session_recall "
1864 " (2*" << recall << " < " << session_recall << ") &&"
1865 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1866 " Skipping because we are unlikely to get more released." << dendl;
1867 continue;
1868 }
1869 }
1870
1871 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1872
1873 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1874 m->head.max_caps = newlim;
1875 mds->send_message_client(m, session);
1876 if (gather) {
1877 flush_session(session, *gather);
1878 }
1879 caps_recalled += session->notify_recall_sent(newlim);
1880 recall_throttle.hit(recall);
1881 }
1882 }
1883
1884 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1885
1886 return result;
1887 }
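// Worked example of the per-session recall sizing above (numbers are purely
// illustrative, not config defaults): suppose min_caps_per_client = 100,
// recall_max_caps = 5000 and a session holds num_caps = 12000 caps.
//
//   num_caps - recall_max_caps = 7000  (>= min_caps_per_client)
//     => newlim = 7000
//   recall  = min(recall_max_caps, num_caps - newlim) = 5000
//     => newlim = num_caps - recall = 7000
//
// The RECALL_STATE message is then sent only if the session's decayed recall
// counters and the global recall_throttle stay under their *_decay_threshold
// limits; otherwise the session is skipped (throttled = true) and retried on a
// later pass.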
1888
1889 void Server::force_clients_readonly()
1890 {
1891 dout(10) << "force_clients_readonly" << dendl;
1892 set<Session*> sessions;
1893 mds->sessionmap.get_client_session_set(sessions);
1894 for (set<Session*>::const_iterator p = sessions.begin();
1895 p != sessions.end();
1896 ++p) {
1897 Session *session = *p;
1898 if (!session->info.inst.name.is_client() ||
1899 !(session->is_open() || session->is_stale()))
1900 continue;
1901 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1902 }
1903 }
1904
1905 /*******
1906 * some generic stuff for finishing off requests
1907 */
1908 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1909 {
1910 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1911 ceph_assert(!mdr->has_completed);
1912
1913 // note trace items for eventual reply.
1914 mdr->tracei = in;
1915 if (in)
1916 mdr->pin(in);
1917
1918 mdr->tracedn = dn;
1919 if (dn)
1920 mdr->pin(dn);
1921
1922 early_reply(mdr, in, dn);
1923
1924 mdr->committing = true;
1925 submit_mdlog_entry(le, fin, mdr, __func__);
1926
1927 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1928 if (mds->queue_one_replay()) {
1929 dout(10) << " queued next replay op" << dendl;
1930 } else {
1931 dout(10) << " journaled last replay op" << dendl;
1932 }
1933 } else if (mdr->did_early_reply)
1934 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1935 else
1936 mdlog->flush();
1937 }
1938
1939 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1940 std::string_view event)
1941 {
1942 if (mdr) {
1943 string event_str("submit entry: ");
1944 event_str += event;
1945 mdr->mark_event(event_str);
1946 }
1947 mdlog->submit_entry(le, fin);
1948 }
1949
1950 /*
1951 * send response built from mdr contents and error code; clean up mdr
1952 */
1953 void Server::respond_to_request(MDRequestRef& mdr, int r)
1954 {
1955 if (mdr->client_request) {
1956 if (mdr->is_batch_head()) {
1957 dout(20) << __func__ << " batch head " << *mdr << dendl;
1958 mdr->release_batch_op()->respond(r);
1959 } else {
1960 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1961 }
1962 } else if (mdr->internal_op > -1) {
1963 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1964 if (!mdr->internal_op_finish)
1965 ceph_abort_msg("trying to respond to internal op without finisher");
1966 mdr->internal_op_finish->complete(r);
1967 mdcache->request_finish(mdr);
1968 }
1969 }
1970
1971 // statistics mds req op number and latency
1972 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1973 {
1974 int code = l_mdss_first;
1975 switch(req->get_op()) {
1976 case CEPH_MDS_OP_LOOKUPHASH:
1977 code = l_mdss_req_lookuphash_latency;
1978 break;
1979 case CEPH_MDS_OP_LOOKUPINO:
1980 code = l_mdss_req_lookupino_latency;
1981 break;
1982 case CEPH_MDS_OP_LOOKUPPARENT:
1983 code = l_mdss_req_lookupparent_latency;
1984 break;
1985 case CEPH_MDS_OP_LOOKUPNAME:
1986 code = l_mdss_req_lookupname_latency;
1987 break;
1988 case CEPH_MDS_OP_LOOKUP:
1989 code = l_mdss_req_lookup_latency;
1990 break;
1991 case CEPH_MDS_OP_LOOKUPSNAP:
1992 code = l_mdss_req_lookupsnap_latency;
1993 break;
1994 case CEPH_MDS_OP_GETATTR:
1995 code = l_mdss_req_getattr_latency;
1996 break;
1997 case CEPH_MDS_OP_SETATTR:
1998 code = l_mdss_req_setattr_latency;
1999 break;
2000 case CEPH_MDS_OP_SETLAYOUT:
2001 code = l_mdss_req_setlayout_latency;
2002 break;
2003 case CEPH_MDS_OP_SETDIRLAYOUT:
2004 code = l_mdss_req_setdirlayout_latency;
2005 break;
2006 case CEPH_MDS_OP_GETVXATTR:
2007 code = l_mdss_req_getvxattr_latency;
2008 break;
2009 case CEPH_MDS_OP_SETXATTR:
2010 code = l_mdss_req_setxattr_latency;
2011 break;
2012 case CEPH_MDS_OP_RMXATTR:
2013 code = l_mdss_req_rmxattr_latency;
2014 break;
2015 case CEPH_MDS_OP_READDIR:
2016 code = l_mdss_req_readdir_latency;
2017 break;
2018 case CEPH_MDS_OP_SETFILELOCK:
2019 code = l_mdss_req_setfilelock_latency;
2020 break;
2021 case CEPH_MDS_OP_GETFILELOCK:
2022 code = l_mdss_req_getfilelock_latency;
2023 break;
2024 case CEPH_MDS_OP_CREATE:
2025 code = l_mdss_req_create_latency;
2026 break;
2027 case CEPH_MDS_OP_OPEN:
2028 code = l_mdss_req_open_latency;
2029 break;
2030 case CEPH_MDS_OP_MKNOD:
2031 code = l_mdss_req_mknod_latency;
2032 break;
2033 case CEPH_MDS_OP_LINK:
2034 code = l_mdss_req_link_latency;
2035 break;
2036 case CEPH_MDS_OP_UNLINK:
2037 code = l_mdss_req_unlink_latency;
2038 break;
2039 case CEPH_MDS_OP_RMDIR:
2040 code = l_mdss_req_rmdir_latency;
2041 break;
2042 case CEPH_MDS_OP_RENAME:
2043 code = l_mdss_req_rename_latency;
2044 break;
2045 case CEPH_MDS_OP_MKDIR:
2046 code = l_mdss_req_mkdir_latency;
2047 break;
2048 case CEPH_MDS_OP_SYMLINK:
2049 code = l_mdss_req_symlink_latency;
2050 break;
2051 case CEPH_MDS_OP_LSSNAP:
2052 code = l_mdss_req_lssnap_latency;
2053 break;
2054 case CEPH_MDS_OP_MKSNAP:
2055 code = l_mdss_req_mksnap_latency;
2056 break;
2057 case CEPH_MDS_OP_RMSNAP:
2058 code = l_mdss_req_rmsnap_latency;
2059 break;
2060 case CEPH_MDS_OP_RENAMESNAP:
2061 code = l_mdss_req_renamesnap_latency;
2062 break;
2063 default:
2064 dout(1) << ": unknown client op" << dendl;
2065 return;
2066 }
2067 logger->tinc(code, lat);
2068 }
2069
2070 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2071 {
2072 if (!g_conf()->mds_early_reply)
2073 return;
2074
2075 if (mdr->no_early_reply) {
2076 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2077 return;
2078 }
2079
2080 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2081 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2082 return;
2083 }
2084
2085 if (mdr->alloc_ino) {
2086 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2087 return;
2088 }
2089
2090 const cref_t<MClientRequest> &req = mdr->client_request;
2091 entity_inst_t client_inst = req->get_source_inst();
2092 if (client_inst.name.is_mds())
2093 return;
2094
2095 if (req->is_replay()) {
2096 dout(10) << " no early reply on replay op" << dendl;
2097 return;
2098 }
2099
2100
2101 auto reply = make_message<MClientReply>(*req, 0);
2102 reply->set_unsafe();
2103
2104 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2105 //
2106 // _rename_finish() does not send dentry link/unlink messages to replicas,
2107 // so do not set xlocks on dentries "done"; the xlocks prevent dentries
2108 // that have projected linkages from getting new replicas.
2109 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2110
2111 dout(10) << "early_reply " << reply->get_result()
2112 << " (" << cpp_strerror(reply->get_result())
2113 << ") " << *req << dendl;
2114
2115 if (tracei || tracedn) {
2116 if (tracei)
2117 mdr->cap_releases.erase(tracei->vino());
2118 if (tracedn)
2119 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2120
2121 set_trace_dist(reply, tracei, tracedn, mdr);
2122 }
2123
2124 reply->set_extra_bl(mdr->reply_extra_bl);
2125 mds->send_message_client(reply, mdr->session);
2126
2127 mdr->did_early_reply = true;
2128
2129 mds->logger->inc(l_mds_reply);
2130 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2131 mds->logger->tinc(l_mds_reply_latency, lat);
2132 if (lat >= g_conf()->mds_op_complaint_time) {
2133 mds->logger->inc(l_mds_slow_reply);
2134 }
2135 if (client_inst.name.is_client()) {
2136 mds->sessionmap.hit_session(mdr->session);
2137 }
2138 perf_gather_op_latency(req, lat);
2139 dout(20) << "lat " << lat << dendl;
2140
2141 mdr->mark_event("early_replied");
2142 }
2143
2144 /*
2145 * send given reply
2146 * include a trace to tracei
2147 * Clean up mdr
2148 */
2149 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2150 {
2151 ceph_assert(mdr.get());
2152 const cref_t<MClientRequest> &req = mdr->client_request;
2153
2154 dout(7) << "reply_client_request " << reply->get_result()
2155 << " (" << cpp_strerror(reply->get_result())
2156 << ") " << *req << dendl;
2157
2158 mdr->mark_event("replying");
2159
2160 Session *session = mdr->session;
2161
2162 // note successful request in session map?
2163 //
2164 // setfilelock requests are special: they only modify state in MDS memory.
2165 // That state is lost when the MDS fails. If a client re-sends a completed
2166 // setfilelock request, it means the client did not receive the corresponding
2167 // setfilelock reply, so the MDS should re-execute the setfilelock request.
2168 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2169 reply->get_result() == 0 && session) {
2170 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2171 session->add_completed_request(mdr->reqid.tid, created);
2172 if (mdr->ls) {
2173 mdr->ls->touched_sessions.insert(session->info.inst.name);
2174 }
2175 }
2176
2177 // give any preallocated inos to the session
2178 apply_allocated_inos(mdr, session);
2179
2180 // get tracei/tracedn from mdr?
2181 CInode *tracei = mdr->tracei;
2182 CDentry *tracedn = mdr->tracedn;
2183
2184 bool is_replay = mdr->client_request->is_replay();
2185 bool did_early_reply = mdr->did_early_reply;
2186 entity_inst_t client_inst = req->get_source_inst();
2187
2188 if (!did_early_reply && !is_replay) {
2189
2190 mds->logger->inc(l_mds_reply);
2191 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2192 mds->logger->tinc(l_mds_reply_latency, lat);
2193 if (lat >= g_conf()->mds_op_complaint_time) {
2194 mds->logger->inc(l_mds_slow_reply);
2195 }
2196 if (session && client_inst.name.is_client()) {
2197 mds->sessionmap.hit_session(session);
2198 }
2199 perf_gather_op_latency(req, lat);
2200 dout(20) << "lat " << lat << dendl;
2201
2202 if (tracei)
2203 mdr->cap_releases.erase(tracei->vino());
2204 if (tracedn)
2205 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2206 }
2207
2208 // drop non-rdlocks before replying, so that we can issue leases
2209 mdcache->request_drop_non_rdlocks(mdr);
2210
2211 // reply at all?
2212 if (session && !client_inst.name.is_mds()) {
2213 // send reply.
2214 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2215 (tracei || tracedn)) {
2216 if (is_replay) {
2217 if (tracei)
2218 mdcache->try_reconnect_cap(tracei, session);
2219 } else {
2220 // include metadata in reply
2221 set_trace_dist(reply, tracei, tracedn, mdr);
2222 }
2223 }
2224
2225 // We can set the extra bl unconditionally: if it's already been sent in the
2226 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2227 reply->set_extra_bl(mdr->reply_extra_bl);
2228
2229 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2230 mds->send_message_client(reply, session);
2231 }
2232
2233 if (req->is_queued_for_replay() &&
2234 (mdr->has_completed || reply->get_result() < 0)) {
2235 if (reply->get_result() < 0) {
2236 int r = reply->get_result();
2237 derr << "reply_client_request: failed to replay " << *req
2238 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2239 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2240 }
2241 mds->queue_one_replay();
2242 }
2243
2244 // clean up request
2245 mdcache->request_finish(mdr);
2246
2247 // take a closer look at tracei, if it happens to be a remote link
2248 if (tracei &&
2249 tracedn &&
2250 tracedn->get_projected_linkage()->is_remote()) {
2251 mdcache->eval_remote(tracedn);
2252 }
2253 }
2254
2255 /*
2256 * pass inode OR dentry (not both, or we may get confused)
2257 *
2258 * trace is in reverse order (i.e. root inode comes last)
2259 */
2260 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2261 CInode *in, CDentry *dn,
2262 MDRequestRef& mdr)
2263 {
2264 // skip doing this for debugging purposes?
2265 if (g_conf()->mds_inject_traceless_reply_probability &&
2266 mdr->ls && !mdr->o_trunc &&
2267 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2268 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2269 return;
2270 }
2271
2272 // inode, dentry, dir, ..., inode
2273 bufferlist bl;
2274 mds_rank_t whoami = mds->get_nodeid();
2275 Session *session = mdr->session;
2276 snapid_t snapid = mdr->snapid;
2277 utime_t now = ceph_clock_now();
2278
2279 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2280
2281 // realm
2282 if (snapid == CEPH_NOSNAP) {
2283 SnapRealm *realm;
2284 if (in)
2285 realm = in->find_snaprealm();
2286 else
2287 realm = dn->get_dir()->get_inode()->find_snaprealm();
2288 reply->snapbl = realm->get_snap_trace();
2289 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2290 }
2291
2292 // dir + dentry?
2293 if (dn) {
2294 reply->head.is_dentry = 1;
2295 CDir *dir = dn->get_dir();
2296 CInode *diri = dir->get_inode();
2297
2298 diri->encode_inodestat(bl, session, NULL, snapid);
2299 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2300
2301 #ifdef MDS_VERIFY_FRAGSTAT
2302 if (dir->is_complete())
2303 dir->verify_fragstat();
2304 #endif
2305 DirStat ds;
2306 ds.frag = dir->get_frag();
2307 ds.auth = dir->get_dir_auth().first;
2308 if (dir->is_auth() && !forward_all_requests_to_auth)
2309 dir->get_dist_spec(ds.dist, whoami);
2310
2311 dir->encode_dirstat(bl, session->info, ds);
2312 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2313
2314 encode(dn->get_name(), bl);
2315
2316 int lease_mask = 0;
2317 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2318 if (dnl->is_primary()) {
2319 ceph_assert(dnl->get_inode() == in);
2320 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2321 } else {
2322 if (dnl->is_remote())
2323 ceph_assert(dnl->get_remote_ino() == in->ino());
2324 else
2325 ceph_assert(!in);
2326 }
2327 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
2328 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2329 } else
2330 reply->head.is_dentry = 0;
2331
2332 // inode
2333 if (in) {
2334 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2335 dout(20) << "set_trace_dist added in " << *in << dendl;
2336 reply->head.is_target = 1;
2337 } else
2338 reply->head.is_target = 0;
2339
2340 reply->set_trace(bl);
2341 }
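// Illustrative layout of the trace bufferlist assembled above, for the common
// case where both a dentry and a target inode are present:
//
//   [diri inodestat][dirstat][dentry name][dentry lease][target inodestat]
//
// reply->head.is_dentry / is_target tell the client which sections exist, and
// the snap trace travels separately in reply->snapbl when snapid == CEPH_NOSNAP.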
2342
2343 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2344 {
2345 dout(4) << "handle_client_request " << *req << dendl;
2346
2347 if (mds->logger)
2348 mds->logger->inc(l_mds_request);
2349 if (logger)
2350 logger->inc(l_mdss_handle_client_request);
2351
2352 if (!mdcache->is_open()) {
2353 dout(5) << "waiting for root" << dendl;
2354 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2355 return;
2356 }
2357
2358 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2359 // active session?
2360 Session *session = 0;
2361 if (req->get_source().is_client()) {
2362 session = mds->get_session(req);
2363 if (!session) {
2364 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2365 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2366 session->is_closing() ||
2367 session->is_killing()) {
2368 dout(5) << "session closed|closing|killing, dropping" << dendl;
2369 session = NULL;
2370 }
2371 if (!session) {
2372 if (req->is_queued_for_replay())
2373 mds->queue_one_replay();
2374 return;
2375 }
2376 }
2377
2378 // old mdsmap?
2379 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2380 // send it? hrm, this isn't ideal; they may get a lot of copies if
2381 // they have a high request rate.
2382 }
2383
2384 // completed request?
2385 bool has_completed = false;
2386 if (req->is_replay() || req->get_retry_attempt()) {
2387 ceph_assert(session);
2388 inodeno_t created;
2389 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2390 has_completed = true;
2391 if (!session->is_open())
2392 return;
2393 // Don't send a traceless reply if the completed request has created a
2394 // new inode. Treat the request as a lookup request instead.
2395 if (req->is_replay() ||
2396 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2397 req->get_op() != CEPH_MDS_OP_OPEN &&
2398 req->get_op() != CEPH_MDS_OP_CREATE)) {
2399 dout(5) << "already completed " << req->get_reqid() << dendl;
2400 auto reply = make_message<MClientReply>(*req, 0);
2401 if (created != inodeno_t()) {
2402 bufferlist extra;
2403 encode(created, extra);
2404 reply->set_extra_bl(extra);
2405 }
2406 mds->send_message_client(reply, session);
2407
2408 if (req->is_queued_for_replay())
2409 mds->queue_one_replay();
2410
2411 return;
2412 }
2413 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2414 req->get_op() != CEPH_MDS_OP_CREATE) {
2415 dout(10) << " completed request which created new inode " << created
2416 << ", convert it to lookup request" << dendl;
2417 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2418 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2419 }
2420 }
2421 }
2422
2423 // trim completed_request list
2424 if (req->get_oldest_client_tid() > 0) {
2425 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2426 ceph_assert(session);
2427 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2428 // The session's 'completed_requests' was dirtied; mark it to be
2429 // potentially flushed at segment expiry.
2430 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2431
2432 if (session->get_num_trim_requests_warnings() > 0 &&
2433 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2434 session->reset_num_trim_requests_warnings();
2435 } else {
2436 if (session->get_num_completed_requests() >=
2437 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2438 session->inc_num_trim_requests_warnings();
2439 CachedStackStringStream css;
2440 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2441 << req->get_oldest_client_tid() << "), "
2442 << session->get_num_completed_requests()
2443 << " completed requests recorded in session\n";
2444 mds->clog->warn() << css->strv();
2445 dout(20) << __func__ << " " << css->strv() << dendl;
2446 }
2447 }
2448 }
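// Worked example of the warning back-off above (100000 is used purely as an
// illustrative value for mds_max_completed_requests): with zero prior warnings
// the threshold is 100000 << 0 = 100000 recorded completed requests; after the
// first warning it becomes 100000 << 1 = 200000, then 400000, and so on. The
// warning counter is reset once the client trims back below half the base
// limit (num_completed_requests * 2 < mds_max_completed_requests).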
2449
2450 // register + dispatch
2451 MDRequestRef mdr = mdcache->request_start(req);
2452 if (!mdr.get())
2453 return;
2454
2455 if (session) {
2456 mdr->session = session;
2457 session->requests.push_back(&mdr->item_session_request);
2458 }
2459
2460 if (has_completed)
2461 mdr->has_completed = true;
2462
2463 // process embedded cap releases?
2464 // (only if NOT replay!)
2465 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2466 client_t client = req->get_source().num();
2467 for (const auto &r : req->releases) {
2468 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2469 }
2470 req->releases.clear();
2471 }
2472
2473 dispatch_client_request(mdr);
2474 return;
2475 }
2476
2477 void Server::handle_osd_map()
2478 {
2479 /* Note that we check the OSDMAP_FULL flag directly rather than
2480 * using osdmap_full_flag(), because we want to know "is the flag set"
2481 * rather than "does the flag apply to us?" */
2482 mds->objecter->with_osdmap([this](const OSDMap& o) {
2483 auto pi = o.get_pg_pool(mds->get_metadata_pool());
2484 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2485 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2486 << o.get_epoch() << dendl;
2487 });
2488 }
2489
2490 void Server::dispatch_client_request(MDRequestRef& mdr)
2491 {
2492 // we shouldn't be waiting on anyone.
2493 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2494
2495 if (mdr->killed) {
2496 dout(10) << "request " << *mdr << " was killed" << dendl;
2497 // if the mdr is a "batch_op" and it has followers, pick a follower as
2498 // the new "head of the batch ops" and go on processing the new one.
2499 if (mdr->is_batch_head()) {
2500 int mask = mdr->client_request->head.args.getattr.mask;
2501 auto it = mdr->batch_op_map->find(mask);
2502 auto new_batch_head = it->second->find_new_head();
2503 if (!new_batch_head) {
2504 mdr->batch_op_map->erase(it);
2505 return;
2506 }
2507 mdr = std::move(new_batch_head);
2508 } else {
2509 return;
2510 }
2511 } else if (mdr->aborted) {
2512 mdr->aborted = false;
2513 mdcache->request_kill(mdr);
2514 return;
2515 }
2516
2517 const cref_t<MClientRequest> &req = mdr->client_request;
2518
2519 if (logger) logger->inc(l_mdss_dispatch_client_request);
2520
2521 dout(7) << "dispatch_client_request " << *req << dendl;
2522
2523 if (req->may_write() && mdcache->is_readonly()) {
2524 dout(10) << " read-only FS" << dendl;
2525 respond_to_request(mdr, -CEPHFS_EROFS);
2526 return;
2527 }
2528 if (mdr->has_more() && mdr->more()->peer_error) {
2529 dout(10) << " got error from peers" << dendl;
2530 respond_to_request(mdr, mdr->more()->peer_error);
2531 return;
2532 }
2533
2534 if (is_full) {
2535 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2536 if (!cur) {
2537 // the request is already responded to
2538 return;
2539 }
2540 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2541 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2542 req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2543 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2544 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2545 req->get_op() == CEPH_MDS_OP_CREATE ||
2546 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2547 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2548 ((req->get_op() == CEPH_MDS_OP_LINK ||
2549 req->get_op() == CEPH_MDS_OP_RENAME) &&
2550 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2551 ) {
2552
2553 if (check_access(mdr, cur, MAY_FULL)) {
2554 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2555 } else {
2556 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2557 respond_to_request(mdr, -CEPHFS_ENOSPC);
2558 return;
2559 }
2560 } else {
2561 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2562 }
2563 }
2564
2565 switch (req->get_op()) {
2566 case CEPH_MDS_OP_LOOKUPHASH:
2567 case CEPH_MDS_OP_LOOKUPINO:
2568 handle_client_lookup_ino(mdr, false, false);
2569 break;
2570 case CEPH_MDS_OP_LOOKUPPARENT:
2571 handle_client_lookup_ino(mdr, true, false);
2572 break;
2573 case CEPH_MDS_OP_LOOKUPNAME:
2574 handle_client_lookup_ino(mdr, false, true);
2575 break;
2576
2577 // inodes ops.
2578 case CEPH_MDS_OP_LOOKUP:
2579 handle_client_getattr(mdr, true);
2580 break;
2581
2582 case CEPH_MDS_OP_LOOKUPSNAP:
2583 // lookupsnap does not reference a CDentry; treat it as a getattr
2584 case CEPH_MDS_OP_GETATTR:
2585 handle_client_getattr(mdr, false);
2586 break;
2587 case CEPH_MDS_OP_GETVXATTR:
2588 handle_client_getvxattr(mdr);
2589 break;
2590
2591 case CEPH_MDS_OP_SETATTR:
2592 handle_client_setattr(mdr);
2593 break;
2594 case CEPH_MDS_OP_SETLAYOUT:
2595 handle_client_setlayout(mdr);
2596 break;
2597 case CEPH_MDS_OP_SETDIRLAYOUT:
2598 handle_client_setdirlayout(mdr);
2599 break;
2600 case CEPH_MDS_OP_SETXATTR:
2601 handle_client_setxattr(mdr);
2602 break;
2603 case CEPH_MDS_OP_RMXATTR:
2604 handle_client_removexattr(mdr);
2605 break;
2606
2607 case CEPH_MDS_OP_READDIR:
2608 handle_client_readdir(mdr);
2609 break;
2610
2611 case CEPH_MDS_OP_SETFILELOCK:
2612 handle_client_file_setlock(mdr);
2613 break;
2614
2615 case CEPH_MDS_OP_GETFILELOCK:
2616 handle_client_file_readlock(mdr);
2617 break;
2618
2619 // funky.
2620 case CEPH_MDS_OP_CREATE:
2621 if (mdr->has_completed)
2622 handle_client_open(mdr); // already created.. just open
2623 else
2624 handle_client_openc(mdr);
2625 break;
2626
2627 case CEPH_MDS_OP_OPEN:
2628 handle_client_open(mdr);
2629 break;
2630
2631 // namespace.
2632 // no prior locks.
2633 case CEPH_MDS_OP_MKNOD:
2634 handle_client_mknod(mdr);
2635 break;
2636 case CEPH_MDS_OP_LINK:
2637 handle_client_link(mdr);
2638 break;
2639 case CEPH_MDS_OP_UNLINK:
2640 case CEPH_MDS_OP_RMDIR:
2641 handle_client_unlink(mdr);
2642 break;
2643 case CEPH_MDS_OP_RENAME:
2644 handle_client_rename(mdr);
2645 break;
2646 case CEPH_MDS_OP_MKDIR:
2647 handle_client_mkdir(mdr);
2648 break;
2649 case CEPH_MDS_OP_SYMLINK:
2650 handle_client_symlink(mdr);
2651 break;
2652
2653
2654 // snaps
2655 case CEPH_MDS_OP_LSSNAP:
2656 handle_client_lssnap(mdr);
2657 break;
2658 case CEPH_MDS_OP_MKSNAP:
2659 handle_client_mksnap(mdr);
2660 break;
2661 case CEPH_MDS_OP_RMSNAP:
2662 handle_client_rmsnap(mdr);
2663 break;
2664 case CEPH_MDS_OP_RENAMESNAP:
2665 handle_client_renamesnap(mdr);
2666 break;
2667
2668 default:
2669 dout(1) << " unknown client op " << req->get_op() << dendl;
2670 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2671 }
2672 }
2673
2674
2675 // ---------------------------------------
2676 // PEER REQUESTS
2677
2678 void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
2679 {
2680 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2681 mds_rank_t from = mds_rank_t(m->get_source().num());
2682
2683 if (logger) logger->inc(l_mdss_handle_peer_request);
2684
2685 // reply?
2686 if (m->is_reply())
2687 return handle_peer_request_reply(m);
2688
2689 // the purpose of rename notify is to enforce causal message ordering, i.e. making sure
2690 // bystanders have received all messages from the rename srcdn's auth MDS.
2691 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2692 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
2693 mds->send_message(reply, m->get_connection());
2694 return;
2695 }
2696
2697 CDentry *straydn = NULL;
2698 if (m->straybl.length() > 0) {
2699 mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
2700 ceph_assert(straydn);
2701 m->straybl.clear();
2702 }
2703
2704 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2705 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2706 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2707 return;
2708 }
2709
2710 // am i a new peer?
2711 MDRequestRef mdr;
2712 if (mdcache->have_request(m->get_reqid())) {
2713 // existing?
2714 mdr = mdcache->request_get(m->get_reqid());
2715
2716 // is my request newer?
2717 if (mdr->attempt > m->get_attempt()) {
2718 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2719 << ", dropping " << *m << dendl;
2720 return;
2721 }
2722
2723 if (mdr->attempt < m->get_attempt()) {
2724 // mine is old, close it out
2725 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2726 << ", closing out" << dendl;
2727 mdcache->request_finish(mdr);
2728 mdr.reset();
2729 } else if (mdr->peer_to_mds != from) {
2730 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2731 return;
2732 }
2733
2734 // may get these while mdr->peer_request is non-null
2735 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2736 mds->locker->drop_locks(mdr.get());
2737 return;
2738 }
2739 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2740 if (m->is_abort()) {
2741 mdr->aborted = true;
2742 if (mdr->peer_request) {
2743 // only abort on-going xlock, wrlock and auth pin
2744 ceph_assert(!mdr->peer_did_prepare());
2745 } else {
2746 mdcache->request_finish(mdr);
2747 }
2748 } else {
2749 if (m->inode_export.length() > 0)
2750 mdr->more()->inode_import = m->inode_export;
2751 // finish off request.
2752 mdcache->request_finish(mdr);
2753 }
2754 return;
2755 }
2756 }
2757 if (!mdr.get()) {
2758 // new?
2759 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2760 dout(10) << "missing peer request for " << m->get_reqid()
2761 << " OP_FINISH, must have lost race with a forward" << dendl;
2762 return;
2763 }
2764 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2765 mdr->set_op_stamp(m->op_stamp);
2766 }
2767 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2768
2769 if (straydn) {
2770 mdr->pin(straydn);
2771 mdr->straydn = straydn;
2772 }
2773
2774 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2775 mdr->locks.empty()) {
2776 dout(3) << "not active yet, waiting" << dendl;
2777 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2778 return;
2779 }
2780
2781 mdr->reset_peer_request(m);
2782
2783 dispatch_peer_request(mdr);
2784 }
2785
2786 void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2787 {
2788 mds_rank_t from = mds_rank_t(m->get_source().num());
2789
2790 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2791 metareqid_t r = m->get_reqid();
2792 if (!mdcache->have_uncommitted_leader(r, from)) {
2793 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2794 << from << " reqid " << r << dendl;
2795 return;
2796 }
2797 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2798 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2799 return;
2800 }
2801
2802 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
2803 metareqid_t r = m->get_reqid();
2804 mdcache->committed_leader_peer(r, from);
2805 return;
2806 }
2807
2808 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2809 if (m->get_attempt() != mdr->attempt) {
2810 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
2811 << m->get_attempt() << dendl;
2812 return;
2813 }
2814
2815 switch (m->get_op()) {
2816 case MMDSPeerRequest::OP_XLOCKACK:
2817 {
2818 // identify lock, leader request
2819 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2820 m->get_object_info());
2821 mdr->more()->peers.insert(from);
2822 lock->decode_locked_state(m->get_lock_data());
2823 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2824 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2825 mdr->finish_locking(lock);
2826 lock->get_xlock(mdr, mdr->get_client());
2827
2828 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2829 mdr->more()->waiting_on_peer.erase(from);
2830 ceph_assert(mdr->more()->waiting_on_peer.empty());
2831 mdcache->dispatch_request(mdr);
2832 }
2833 break;
2834
2835 case MMDSPeerRequest::OP_WRLOCKACK:
2836 {
2837 // identify lock, leader request
2838 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2839 m->get_object_info());
2840 mdr->more()->peers.insert(from);
2841 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2842 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2843 ceph_assert(it->is_remote_wrlock());
2844 ceph_assert(it->wrlock_target == from);
2845
2846 mdr->finish_locking(lock);
2847
2848 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2849 mdr->more()->waiting_on_peer.erase(from);
2850 ceph_assert(mdr->more()->waiting_on_peer.empty());
2851 mdcache->dispatch_request(mdr);
2852 }
2853 break;
2854
2855 case MMDSPeerRequest::OP_AUTHPINACK:
2856 handle_peer_auth_pin_ack(mdr, m);
2857 break;
2858
2859 case MMDSPeerRequest::OP_LINKPREPACK:
2860 handle_peer_link_prep_ack(mdr, m);
2861 break;
2862
2863 case MMDSPeerRequest::OP_RMDIRPREPACK:
2864 handle_peer_rmdir_prep_ack(mdr, m);
2865 break;
2866
2867 case MMDSPeerRequest::OP_RENAMEPREPACK:
2868 handle_peer_rename_prep_ack(mdr, m);
2869 break;
2870
2871 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2872 handle_peer_rename_notify_ack(mdr, m);
2873 break;
2874
2875 default:
2876 ceph_abort();
2877 }
2878 }
2879
2880 void Server::dispatch_peer_request(MDRequestRef& mdr)
2881 {
2882 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2883
2884 if (mdr->aborted) {
2885 dout(7) << " abort flag set, finishing" << dendl;
2886 mdcache->request_finish(mdr);
2887 return;
2888 }
2889
2890 if (logger) logger->inc(l_mdss_dispatch_peer_request);
2891
2892 int op = mdr->peer_request->get_op();
2893 switch (op) {
2894 case MMDSPeerRequest::OP_XLOCK:
2895 case MMDSPeerRequest::OP_WRLOCK:
2896 {
2897 // identify object
2898 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2899 mdr->peer_request->get_object_info());
2900
2901 if (!lock) {
2902 dout(10) << "don't have object, dropping" << dendl;
2903 ceph_abort(); // can this happen if we auth pinned properly?
2904 }
2905 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2906 dout(10) << "not auth for remote xlock attempt, dropping on "
2907 << *lock << " on " << *lock->get_parent() << dendl;
2908 } else {
2909 // use acquire_locks so that we get auth_pinning.
2910 MutationImpl::LockOpVec lov;
2911 for (const auto& p : mdr->locks) {
2912 if (p.is_xlock())
2913 lov.add_xlock(p.lock);
2914 else if (p.is_wrlock())
2915 lov.add_wrlock(p.lock);
2916 }
2917
2918 int replycode = 0;
2919 switch (op) {
2920 case MMDSPeerRequest::OP_XLOCK:
2921 lov.add_xlock(lock);
2922 replycode = MMDSPeerRequest::OP_XLOCKACK;
2923 break;
2924 case MMDSPeerRequest::OP_WRLOCK:
2925 lov.add_wrlock(lock);
2926 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2927 break;
2928 }
2929
2930 if (!mds->locker->acquire_locks(mdr, lov))
2931 return;
2932
2933 // ack
2934 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2935 r->set_lock_type(lock->get_type());
2936 lock->get_parent()->set_object_info(r->get_object_info());
2937 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
2938 lock->encode_locked_state(r->get_lock_data());
2939 mds->send_message(r, mdr->peer_request->get_connection());
2940 }
2941
2942 // done.
2943 mdr->reset_peer_request();
2944 }
2945 break;
2946
2947 case MMDSPeerRequest::OP_UNXLOCK:
2948 case MMDSPeerRequest::OP_UNWRLOCK:
2949 {
2950 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2951 mdr->peer_request->get_object_info());
2952 ceph_assert(lock);
2953 auto it = mdr->locks.find(lock);
2954 ceph_assert(it != mdr->locks.end());
2955 bool need_issue = false;
2956 switch (op) {
2957 case MMDSPeerRequest::OP_UNXLOCK:
2958 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2959 break;
2960 case MMDSPeerRequest::OP_UNWRLOCK:
2961 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2962 break;
2963 }
2964 if (need_issue)
2965 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2966
2967 // done. no ack necessary.
2968 mdr->reset_peer_request();
2969 }
2970 break;
2971
2972 case MMDSPeerRequest::OP_AUTHPIN:
2973 handle_peer_auth_pin(mdr);
2974 break;
2975
2976 case MMDSPeerRequest::OP_LINKPREP:
2977 case MMDSPeerRequest::OP_UNLINKPREP:
2978 handle_peer_link_prep(mdr);
2979 break;
2980
2981 case MMDSPeerRequest::OP_RMDIRPREP:
2982 handle_peer_rmdir_prep(mdr);
2983 break;
2984
2985 case MMDSPeerRequest::OP_RENAMEPREP:
2986 handle_peer_rename_prep(mdr);
2987 break;
2988
2989 default:
2990 ceph_abort();
2991 }
2992 }
2993
2994 void Server::handle_peer_auth_pin(MDRequestRef& mdr)
2995 {
2996 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
2997
2998 // build list of objects
2999 list<MDSCacheObject*> objects;
3000 CInode *auth_pin_freeze = NULL;
3001 bool nonblocking = mdr->peer_request->is_nonblocking();
3002 bool fail = false, wouldblock = false, readonly = false;
3003 ref_t<MMDSPeerRequest> reply;
3004
3005 if (mdcache->is_readonly()) {
3006 dout(10) << " read-only FS" << dendl;
3007 readonly = true;
3008 fail = true;
3009 }
3010
3011 if (!fail) {
3012 for (const auto &oi : mdr->peer_request->get_authpins()) {
3013 MDSCacheObject *object = mdcache->get_object(oi);
3014 if (!object) {
3015 dout(10) << " don't have " << oi << dendl;
3016 fail = true;
3017 break;
3018 }
3019
3020 objects.push_back(object);
3021 if (oi == mdr->peer_request->get_authpin_freeze())
3022 auth_pin_freeze = static_cast<CInode*>(object);
3023 }
3024 }
3025
3026 // can we auth pin them?
3027 if (!fail) {
3028 for (const auto& obj : objects) {
3029 if (!obj->is_auth()) {
3030 dout(10) << " not auth for " << *obj << dendl;
3031 fail = true;
3032 break;
3033 }
3034 if (mdr->is_auth_pinned(obj))
3035 continue;
3036 if (!mdr->can_auth_pin(obj)) {
3037 if (nonblocking) {
3038 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
3039 fail = true;
3040 wouldblock = true;
3041 break;
3042 }
3043 // wait
3044 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3045 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3046 mdr->drop_local_auth_pins();
3047
3048 mds->locker->notify_freeze_waiter(obj);
3049 goto blocked;
3050 }
3051 }
3052 }
3053
3054 if (!fail) {
3055 /* a frozen auth pin is held on the wrong inode; undo it */
3056 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3057 mdr->more()->rename_inode != auth_pin_freeze)
3058 mdr->unfreeze_auth_pin(true);
3059
3060 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
3061 * on the source inode to complete. This happens after all locks for the rename
3062 * operation are acquired. But to acquire locks, we first need to auth pin the locks'
3063 * parent objects. So there is an ABBA deadlock if someone auth pins the source inode
3064 * after the locks are acquired and before Server::handle_peer_rename_prep() is called.
3065 * The solution is to freeze the inode and prevent other MDRequests from getting new
3066 * auth pins.
3067 */
3068 if (auth_pin_freeze) {
3069 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3070 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3071 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3072 mds->mdlog->flush();
3073 goto blocked;
3074 }
3075 }
3076 }
3077
3078 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3079
3080 if (fail) {
3081 mdr->drop_local_auth_pins(); // just in case
3082 if (readonly)
3083 reply->mark_error_rofs();
3084 if (wouldblock)
3085 reply->mark_error_wouldblock();
3086 } else {
3087 // auth pin!
3088 for (const auto& obj : objects) {
3089 dout(10) << "auth_pinning " << *obj << dendl;
3090 mdr->auth_pin(obj);
3091 }
3092 // return list of my auth_pins (if any)
3093 for (const auto &p : mdr->object_states) {
3094 if (!p.second.auth_pinned)
3095 continue;
3096 MDSCacheObjectInfo info;
3097 p.first->set_object_info(info);
3098 reply->get_authpins().push_back(info);
3099 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3100 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3101 }
3102 }
3103
3104 mds->send_message_mds(reply, mdr->peer_to_mds);
3105
3106 // clean up this request
3107 mdr->reset_peer_request();
3108 return;
3109
3110 blocked:
3111 if (mdr->peer_request->should_notify_blocking()) {
3112 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3113 reply->mark_req_blocked();
3114 mds->send_message_mds(reply, mdr->peer_to_mds);
3115 mdr->peer_request->clear_notify_blocking();
3116 }
3117 return;
3118 }
3119
3120 void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
3121 {
3122 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3123 mds_rank_t from = mds_rank_t(ack->get_source().num());
3124
3125 if (ack->is_req_blocked()) {
3126 mdr->disable_lock_cache();
3127 // peer auth pin is blocked, drop locks to avoid deadlock
3128 mds->locker->drop_locks(mdr.get(), nullptr);
3129 return;
3130 }
3131
3132 // added auth pins?
3133 set<MDSCacheObject*> pinned;
3134 for (const auto &oi : ack->get_authpins()) {
3135 MDSCacheObject *object = mdcache->get_object(oi);
3136 ceph_assert(object); // we pinned it
3137 dout(10) << " remote has pinned " << *object << dendl;
3138 mdr->set_remote_auth_pinned(object, from);
3139 if (oi == ack->get_authpin_freeze())
3140 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3141 pinned.insert(object);
3142 }
3143
3144 // removed frozen auth pin ?
3145 if (mdr->more()->is_remote_frozen_authpin &&
3146 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3147 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3148 ceph_assert(stat_p);
3149 if (stat_p->remote_auth_pinned == from) {
3150 mdr->more()->is_remote_frozen_authpin = false;
3151 }
3152 }
3153
3154 // removed auth pins?
3155 for (auto& p : mdr->object_states) {
3156 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3157 continue;
3158 MDSCacheObject* object = p.first;
3159 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3160 dout(10) << " remote has unpinned " << *object << dendl;
3161 mdr->_clear_remote_auth_pinned(p.second);
3162 }
3163 }
3164
3165 // note peer
3166 mdr->more()->peers.insert(from);
3167
3168 // clear from waiting list
3169 auto ret = mdr->more()->waiting_on_peer.erase(from);
3170 ceph_assert(ret);
3171
3172 if (ack->is_error_rofs()) {
3173 mdr->more()->peer_error = -CEPHFS_EROFS;
3174 } else if (ack->is_error_wouldblock()) {
3175 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
3176 }
3177
3178 // go again?
3179 if (mdr->more()->waiting_on_peer.empty())
3180 mdcache->dispatch_request(mdr);
3181 else
3182 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
3183 }
3184
3185
3186 // ---------------------------------------
3187 // HELPERS
3188
3189
3190 /**
3191 * check whether we are permitted to complete a request
3192 *
3193 * Check whether we have permission to perform the operation specified
3194 * by mask on the given inode, based on the capability in the mdr's
3195 * session.
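 *
 * Typical use in a request handler (e.g. handle_client_open() below):
 *
 *   if (!check_access(mdr, cur, MAY_WRITE))
 *     return;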
3196 */
3197 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3198 {
3199 if (mdr->session) {
3200 int r = mdr->session->check_access(
3201 in, mask,
3202 mdr->client_request->get_caller_uid(),
3203 mdr->client_request->get_caller_gid(),
3204 &mdr->client_request->get_caller_gid_list(),
3205 mdr->client_request->head.args.setattr.uid,
3206 mdr->client_request->head.args.setattr.gid);
3207 if (r < 0) {
3208 respond_to_request(mdr, r);
3209 return false;
3210 }
3211 }
3212 return true;
3213 }
3214
3215 /**
3216 * check whether fragment has reached maximum size
3217 *
3218 */
3219 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
3220 {
3221 const auto size = dir->get_frag_size();
3222 const auto max = bal_fragment_size_max;
3223 if (size >= max) {
3224 dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
3225 respond_to_request(mdr, -CEPHFS_ENOSPC);
3226 return false;
3227 } else {
3228 dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
3229 }
3230
3231 return true;
3232 }
3233
3234 /**
3235 * check whether entries in a dir reached maximum size
3236 *
3237 */
3238 bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
3239 {
3240 const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
3241 in->inode->get_projected_inode()->dirstat.nsubdirs;
3242 if (dir_max_entries && size >= dir_max_entries) {
3243 dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
3244 respond_to_request(mdr, -ENOSPC);
3245 return false;
3246 }
3247 return true;
3248 }
3249
3250
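// prepare_stray_dentry
//
// Find or create the null stray dentry that will hold @in once it is
// unlinked.  Returns nullptr if the stray dir is frozen (request re-queued
// to wait) or the stray fragment is full (request already answered).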
3251 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3252 {
3253 string straydname;
3254 in->name_stray_dentry(straydname);
3255
3256 CDentry *straydn = mdr->straydn;
3257 if (straydn) {
3258 ceph_assert(straydn->get_name() == straydname);
3259 return straydn;
3260 }
3261 CDir *straydir = mdcache->get_stray_dir(in);
3262
3263 if (!mdr->client_request->is_replay() &&
3264 !check_fragment_space(mdr, straydir))
3265 return nullptr;
3266
3267 straydn = straydir->lookup(straydname);
3268 if (!straydn) {
3269 if (straydir->is_frozen_dir()) {
3270 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3271 mds->locker->drop_locks(mdr.get());
3272 mdr->drop_local_auth_pins();
3273 straydir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3274 return nullptr;
3275 }
3276 straydn = straydir->add_null_dentry(straydname);
3277 straydn->mark_new();
3278 } else {
3279 ceph_assert(straydn->get_projected_linkage()->is_null());
3280 }
3281
3282 straydn->state_set(CDentry::STATE_STRAY);
3283 mdr->straydn = straydn;
3284 mdr->pin(straydn);
3285
3286 return straydn;
3287 }
3288
3289 /** prepare_new_inode
3290 *
3291 * create a new inode. set c/m/atime. hit dir pop.
3292 */
3293 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3294 const file_layout_t *layout)
3295 {
3296 CInode *in = new CInode(mdcache);
3297 auto _inode = in->_get_inode();
3298
3299 // Server::prepare_force_open_sessions() can re-open session in closing
3300 // state. In that corner case, session's prealloc_inos are being freed.
3301 // To simplify the code, we disallow using/refilling session's prealloc_ino
3302 // while session is opening.
3303 bool allow_prealloc_inos = mdr->session->is_open();
3304
3305 // assign ino
3306 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
3307 mds->sessionmap.mark_projected(mdr->session);
3308 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3309 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3310 << dendl;
3311 } else {
3312 mdr->alloc_ino =
3313 _inode->ino = mds->inotable->project_alloc_id(useino);
3314 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3315 }
3316
3317 if (useino && useino != _inode->ino) {
3318 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
3319 mds->clog->error() << mdr->client_request->get_source()
3320 << " specified ino " << useino
3321 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
3322 //ceph_abort(); // just for now.
3323 }
3324
3325 if (allow_prealloc_inos &&
3326 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3327 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3328 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3329 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3330 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3331 mds->sessionmap.mark_projected(mdr->session);
3332 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3333 }
3334
3335 _inode->version = 1;
3336 _inode->xattr_version = 1;
3337 _inode->nlink = 1; // FIXME
3338
3339 _inode->mode = mode;
3340
3341 // FIPS zeroization audit 20191117: this memset is not security related.
3342 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3343 if (_inode->is_dir()) {
3344 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3345 } else if (layout) {
3346 _inode->layout = *layout;
3347 } else {
3348 _inode->layout = mdcache->default_file_layout;
3349 }
3350
3351 _inode->truncate_size = -1ull; // not truncated, yet!
3352 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3353
3354 CInode *diri = dir->get_inode();
3355
3356 dout(10) << oct << " dir mode 0" << diri->get_inode()->mode << " new mode 0" << mode << dec << dendl;
3357
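  // inherit gid (and, for new directories, the setgid bit) from a setgid parent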
3358 if (diri->get_inode()->mode & S_ISGID) {
3359 dout(10) << " dir is sticky" << dendl;
3360 _inode->gid = diri->get_inode()->gid;
3361 if (S_ISDIR(mode)) {
3362 dout(10) << " new dir also sticky" << dendl;
3363 _inode->mode |= S_ISGID;
3364 }
3365 } else
3366 _inode->gid = mdr->client_request->get_caller_gid();
3367
3368 _inode->uid = mdr->client_request->get_caller_uid();
3369
3370 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
3371 mdr->get_op_stamp();
3372
3373 _inode->change_attr = 0;
3374
3375 const cref_t<MClientRequest> &req = mdr->client_request;
3376 if (req->get_data().length()) {
3377 auto p = req->get_data().cbegin();
3378
3379 // xattrs on new inode?
3380 auto _xattrs = CInode::allocate_xattr_map();
3381 decode_noshare(*_xattrs, p);
3382 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3383 if (_xattrs->count("encryption.ctx")) {
3384 _inode->fscrypt = true;
3385 }
3386 in->reset_xattrs(std::move(_xattrs));
3387 }
3388
3389 if (!mds->mdsmap->get_inline_data_enabled() ||
3390 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3391 _inode->inline_data.version = CEPH_INLINE_NONE;
3392
3393 mdcache->add_inode(in); // add
3394 dout(10) << "prepare_new_inode " << *in << dendl;
3395 return in;
3396 }
3397
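// Record this request's ino allocations (alloc_ino, used_prealloc_ino,
// prealloc_inos) in the EMetaBlob so replay can update the InoTable and
// SessionMap consistently.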
3398 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3399 {
3400 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3401 << " inotablev " << mds->inotable->get_projected_version()
3402 << dendl;
3403 blob->set_ino_alloc(mdr->alloc_ino,
3404 mdr->used_prealloc_ino,
3405 mdr->prealloc_inos,
3406 mdr->client_request->get_source(),
3407 mds->sessionmap.get_projected(),
3408 mds->inotable->get_projected_version());
3409 }
3410
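// Apply the ino allocations recorded for this request to the InoTable and
// to the session's preallocated-ino sets.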
3411 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3412 {
3413 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3414 << " / " << mdr->prealloc_inos
3415 << " / " << mdr->used_prealloc_ino << dendl;
3416
3417 if (mdr->alloc_ino) {
3418 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3419 }
3420 if (mdr->prealloc_inos.size()) {
3421 ceph_assert(session);
3422 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3423 session->free_prealloc_inos.insert(mdr->prealloc_inos);
3424 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3425 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3426 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3427 }
3428 if (mdr->used_prealloc_ino) {
3429 ceph_assert(session);
3430 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
3431 mds->sessionmap.mark_dirty(session);
3432 }
3433 }
3434
3435 class C_MDS_TryFindInode : public ServerContext {
3436 MDRequestRef mdr;
3437 public:
3438 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3439 void finish(int r) override {
3440 if (r == -CEPHFS_ESTALE) // :( find_ino_peers failed
3441 server->respond_to_request(mdr, r);
3442 else
3443 server->dispatch_client_request(mdr);
3444 }
3445 };
3446
3447 /* If this returns null, the request has been handled
3448 * as appropriate: forwarded on, or the client's been replied to */
3449 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3450 bool want_auth,
3451 bool no_want_auth)
3452 {
3453 const filepath& refpath = mdr->get_filepath();
3454 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3455
3456 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3457 return mdr->in[0];
3458
3459 // traverse
3460 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3461 int flags = 0;
3462 if (refpath.is_last_snap()) {
3463 if (!no_want_auth)
3464 want_auth = true;
3465 } else {
3466 if (!no_want_auth && forward_all_requests_to_auth)
3467 want_auth = true;
3468 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3469 }
3470 if (want_auth)
3471 flags |= MDS_TRAVERSE_WANT_AUTH;
3472 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3473 if (r > 0)
3474 return nullptr; // delayed
3475 if (r < 0) { // error
3476 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
3477 if (mdr->client_request &&
3478 mdr->client_request->get_dentry_wanted())
3479 mdr->tracedn = mdr->dn[0].back();
3480 respond_to_request(mdr, r);
3481 } else if (r == -CEPHFS_ESTALE) {
3482 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3483 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3484 mdcache->find_ino_peers(refpath.get_ino(), c);
3485 } else {
3486 dout(10) << "FAIL on error " << r << dendl;
3487 respond_to_request(mdr, r);
3488 }
3489 return nullptr;
3490 }
3491 CInode *ref = mdr->in[0];
3492 dout(10) << "ref is " << *ref << dendl;
3493
3494 if (want_auth) {
3495 // auth_pin?
3496 // do NOT proceed if freezing, as cap release may defer in that case, and
3497 // we could deadlock when we try to lock @ref.
3498 // if we're already auth_pinned, continue; the release has already been processed.
3499 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3500 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3501 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3502 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3503 if (mdr->is_any_remote_auth_pin())
3504 mds->locker->notify_freeze_waiter(ref);
3505 return nullptr;
3506 }
3507 mdr->auth_pin(ref);
3508 }
3509
3510 // set and pin ref
3511 mdr->pin(ref);
3512 return ref;
3513 }
3514
3515
3516 /** rdlock_path_xlock_dentry
3517 * traverse path to the directory that could/would contain dentry.
3518 * make sure i am auth for that dentry, forward as necessary.
3519 * create null dentry in place (or use existing if okexist).
3520 * get rdlocks on traversed dentries, xlock on new dentry.
3521 */
3522 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3523 bool create, bool okexist, bool want_layout)
3524 {
3525 const filepath& refpath = mdr->get_filepath();
3526 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3527
3528 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3529 return mdr->dn[0].back();
3530
3531 // figure parent dir vs dname
3532 if (refpath.depth() == 0) {
3533 dout(7) << "invalid path (zero length)" << dendl;
3534 respond_to_request(mdr, -CEPHFS_EINVAL);
3535 return nullptr;
3536 }
3537
3538 if (refpath.is_last_snap()) {
3539 respond_to_request(mdr, -CEPHFS_EROFS);
3540 return nullptr;
3541 }
3542
3543 if (refpath.is_last_dot_or_dotdot()) {
3544 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3545 if (create)
3546 respond_to_request(mdr, -CEPHFS_EEXIST);
3547 else
3548 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
3549 return nullptr;
3550 }
3551
3552 // traverse to parent dir
3553 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3554 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3555 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3556 MDS_TRAVERSE_WANT_AUTH;
3557 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3558 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3559 if (create)
3560 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3561 if (want_layout)
3562 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3563 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3564 if (r > 0)
3565 return nullptr; // delayed
3566 if (r < 0) {
3567 if (r == -CEPHFS_ESTALE) {
3568 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3569 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3570 return nullptr;
3571 }
3572 respond_to_request(mdr, r);
3573 return nullptr;
3574 }
3575
3576 CDentry *dn = mdr->dn[0].back();
3577 CDir *dir = dn->get_dir();
3578 CInode *diri = dir->get_inode();
3579
3580 if (!mdr->reqid.name.is_mds()) {
3581 if (diri->is_system() && !diri->is_root()) {
3582 respond_to_request(mdr, -CEPHFS_EROFS);
3583 return nullptr;
3584 }
3585 }
3586
3587 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3588 respond_to_request(mdr, -CEPHFS_ENOENT);
3589 return nullptr;
3590 }
3591
3592 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3593 if (dnl->is_null()) {
3594 if (!create && okexist) {
3595 respond_to_request(mdr, -CEPHFS_ENOENT);
3596 return nullptr;
3597 }
3598
3599 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3600 dn->first = std::max(dn->first, next_snap);
3601 } else {
3602 if (!okexist) {
3603 respond_to_request(mdr, -CEPHFS_EEXIST);
3604 return nullptr;
3605 }
3606 mdr->in[0] = dnl->get_inode();
3607 }
3608
3609 return dn;
3610 }
3611
3612 /** rdlock_two_paths_xlock_destdn
3613 * traverse two paths and lock the two paths in proper order.
3614 * The order of taking locks is:
3615 * 1. Lock directory inodes or dentries according to which trees they
3616 * are under. Lock objects under fs root before objects under mdsdir.
3617 * 2. Lock directory inodes or dentries according to their depth, in
3618 * ascending order.
3619 * 3. Lock directory inodes or dentries according to inode numbers or
3620 * dentries' parent inode numbers, in ascending order.
3621 * 4. Lock dentries in the same directory in order of their keys.
3622 * 5. Lock non-directory inodes according to inode numbers, in ascending
3623 * order.
3624 */
3625 std::pair<CDentry*, CDentry*>
3626 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3627 {
3628
3629 const filepath& refpath = mdr->get_filepath();
3630 const filepath& refpath2 = mdr->get_filepath2();
3631
3632 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3633
3634 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3635 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3636
3637 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3638 respond_to_request(mdr, -CEPHFS_EINVAL);
3639 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3640 }
3641
3642 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3643 respond_to_request(mdr, -CEPHFS_EROFS);
3644 return std::make_pair(nullptr, nullptr);
3645 }
3646
3647 // traverse to parent dir
3648 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3649 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3650 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3651 if (r != 0) {
3652 if (r == -CEPHFS_ESTALE) {
3653 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
3654 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3655 } else if (r < 0) {
3656 respond_to_request(mdr, r);
3657 }
3658 return std::make_pair(nullptr, nullptr);
3659 }
3660
3661 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3662 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3663 if (r != 0) {
3664 if (r == -CEPHFS_ESTALE) {
3665 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
3666 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3667 } else if (r < 0) {
3668 respond_to_request(mdr, r);
3669 }
3670 return std::make_pair(nullptr, nullptr);
3671 }
3672
3673 CDentry *srcdn = mdr->dn[1].back();
3674 CDir *srcdir = srcdn->get_dir();
3675 CDentry *destdn = mdr->dn[0].back();
3676 CDir *destdir = destdn->get_dir();
3677
3678 if (!mdr->reqid.name.is_mds()) {
3679 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3680 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3681 respond_to_request(mdr, -CEPHFS_EROFS);
3682 return std::make_pair(nullptr, nullptr);
3683 }
3684 }
3685
3686 if (!destdir->get_inode()->is_base() &&
3687 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3688 respond_to_request(mdr, -CEPHFS_ENOENT);
3689 return std::make_pair(nullptr, nullptr);
3690 }
3691
3692 MutationImpl::LockOpVec lov;
3693 if (srcdir->get_inode() == destdir->get_inode()) {
3694 lov.add_wrlock(&destdir->inode->filelock);
3695 lov.add_wrlock(&destdir->inode->nestlock);
3696 if (xlock_srcdn && srcdir != destdir) {
3697 mds_rank_t srcdir_auth = srcdir->authority().first;
3698 if (srcdir_auth != mds->get_nodeid()) {
3699 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3700 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3701 }
3702 }
3703
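    // rule 4: order the srcdn/destdn locks by dentry name (key) within the same dir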
3704 if (srcdn->get_name() > destdn->get_name())
3705 lov.add_xlock(&destdn->lock);
3706
3707 if (xlock_srcdn)
3708 lov.add_xlock(&srcdn->lock);
3709 else
3710 lov.add_rdlock(&srcdn->lock);
3711
3712 if (srcdn->get_name() < destdn->get_name())
3713 lov.add_xlock(&destdn->lock);
3714 } else {
3715 int cmp = mdr->compare_paths();
3716 bool lock_destdir_first =
3717 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3718
3719 if (lock_destdir_first) {
3720 lov.add_wrlock(&destdir->inode->filelock);
3721 lov.add_wrlock(&destdir->inode->nestlock);
3722 lov.add_xlock(&destdn->lock);
3723 }
3724
3725 if (xlock_srcdn) {
3726 mds_rank_t srcdir_auth = srcdir->authority().first;
3727 if (srcdir_auth == mds->get_nodeid()) {
3728 lov.add_wrlock(&srcdir->inode->filelock);
3729 lov.add_wrlock(&srcdir->inode->nestlock);
3730 } else {
3731 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3732 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3733 }
3734 lov.add_xlock(&srcdn->lock);
3735 } else {
3736 lov.add_rdlock(&srcdn->lock);
3737 }
3738
3739 if (!lock_destdir_first) {
3740 lov.add_wrlock(&destdir->inode->filelock);
3741 lov.add_wrlock(&destdir->inode->nestlock);
3742 lov.add_xlock(&destdn->lock);
3743 }
3744 }
3745
3746 CInode *auth_pin_freeze = nullptr;
3747 // XXX any better way to do this?
3748 if (xlock_srcdn && !srcdn->is_auth()) {
3749 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3750 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3751 }
3752 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3753 return std::make_pair(nullptr, nullptr);
3754
3755 if (srcdn->get_projected_linkage()->is_null()) {
3756 respond_to_request(mdr, -CEPHFS_ENOENT);
3757 return std::make_pair(nullptr, nullptr);
3758 }
3759
3760 if (destdn->get_projected_linkage()->is_null()) {
3761 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3762 destdn->first = std::max(destdn->first, next_snap);
3763 }
3764
3765 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3766
3767 return std::make_pair(destdn, srcdn);
3768 }
3769
3770 /**
3771 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3772 *
3773 * @param diri base inode
3774 * @param fg the exact frag we want
3775 * @param mdr request
3776 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3777 */
3778 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3779 {
3780 CDir *dir = diri->get_dirfrag(fg);
3781
3782 if (dir) {
3783 // am i auth for the dirfrag?
3784 if (!dir->is_auth()) {
3785 mds_rank_t auth = dir->authority().first;
3786 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3787 << ", fw to mds." << auth << dendl;
3788 mdcache->request_forward(mdr, auth);
3789 return nullptr;
3790 }
3791 } else {
3792 // not open and inode not mine?
3793 if (!diri->is_auth()) {
3794 mds_rank_t inauth = diri->authority().first;
3795 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3796 mdcache->request_forward(mdr, inauth);
3797 return nullptr;
3798 }
3799
3800 // not open and inode frozen?
3801 if (diri->is_frozen()) {
3802 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3803 ceph_assert(diri->get_parent_dir());
3804 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3805 return nullptr;
3806 }
3807
3808 // invent?
3809 dir = diri->get_or_open_dirfrag(mdcache, fg);
3810 }
3811
3812 return dir;
3813 }
3814
3815
3816 // ===============================================================================
3817 // STAT
3818
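// handle_client_getattr
//
// Service a getattr or lookup.  Requests that can be batched are parked on
// the inode's (or dentry's) batch_ops so a single traversal/lock pass can
// answer all of them; otherwise rdlock only the locks implied by the
// requested caps mask and reply with a trace.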
3819 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3820 {
3821 const cref_t<MClientRequest> &req = mdr->client_request;
3822
3823 if (req->get_filepath().depth() == 0 && is_lookup) {
3824 // refpath can't be empty for lookup but it can for
3825 // getattr (we do getattr with empty refpath for mount of '/')
3826 respond_to_request(mdr, -CEPHFS_EINVAL);
3827 return;
3828 }
3829
3830 bool want_auth = false;
3831 int mask = req->head.args.getattr.mask;
3832 if (mask & CEPH_STAT_RSTAT)
3833 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3834
3835 if (!mdr->is_batch_head() && mdr->can_batch()) {
3836 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
3837 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3838 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3839 &mdr->dn[0], &mdr->in[0]);
3840 if (r > 0)
3841 return; // delayed
3842
3843 if (r < 0) {
3844 // fall-thru. let rdlock_path_pin_ref() check again.
3845 } else if (is_lookup) {
3846 CDentry* dn = mdr->dn[0].back();
3847 mdr->pin(dn);
3848 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3849 if (em.second) {
3850 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3851 } else {
3852 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3853 em.first->second->add_request(mdr);
3854 return;
3855 }
3856 } else {
3857 CInode *in = mdr->in[0];
3858 mdr->pin(in);
3859 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3860 if (em.second) {
3861 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3862 } else {
3863 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3864 em.first->second->add_request(mdr);
3865 return;
3866 }
3867 }
3868 }
3869
3870 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3871 if (!ref)
3872 return;
3873
3874 mdr->getattr_caps = mask;
3875
3876 /*
3877 * if client currently holds the EXCL cap on a field, do not rdlock
3878 * it; client's stat() will result in valid info if _either_ EXCL
3879 * cap is held or MDS rdlocks and reads the value here.
3880 *
3881 * handling this case here is easier than weakening rdlock
3882 * semantics... that would cause problems elsewhere.
3883 */
3884 client_t client = mdr->get_client();
3885 int issued = 0;
3886 Capability *cap = ref->get_client_cap(client);
3887 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3888 mdr->snapid <= cap->client_follows))
3889 issued = cap->issued();
3890
3891 // FIXME
3892 MutationImpl::LockOpVec lov;
3893 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3894 lov.add_rdlock(&ref->linklock);
3895 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3896 lov.add_rdlock(&ref->authlock);
3897 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3898 lov.add_rdlock(&ref->xattrlock);
3899 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3900 // Don't wait on unstable filelock if client is allowed to read file size.
3901 // This can reduce the response time of getattr in the case that multiple
3902 // clients do stat(2) and there are writers.
3903 // The downside of this optimization is that mds may not issue Fs caps along
3904 // with getattr reply. Client may need to send more getattr requests.
3905 if (mdr->is_rdlocked(&ref->filelock)) {
3906 lov.add_rdlock(&ref->filelock);
3907 } else if (ref->filelock.is_stable() ||
3908 ref->filelock.get_num_wrlocks() > 0 ||
3909 !ref->filelock.can_read(mdr->get_client())) {
3910 lov.add_rdlock(&ref->filelock);
3911 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3912 }
3913 }
3914
3915 if (!mds->locker->acquire_locks(mdr, lov))
3916 return;
3917
3918 if (!check_access(mdr, ref, MAY_READ))
3919 return;
3920
3921 utime_t now = ceph_clock_now();
3922 mdr->set_mds_stamp(now);
3923
3924 // note which caps are requested, so we return at least a snapshot
3925 // value for them. (currently this matters for xattrs and inline data)
3926 mdr->getattr_caps = mask;
3927
3928 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3929
3930 // reply
3931 dout(10) << "reply to stat on " << *req << dendl;
3932 mdr->tracei = ref;
3933 if (is_lookup)
3934 mdr->tracedn = mdr->dn[0].back();
3935 respond_to_request(mdr, 0);
3936 }
3937
3938 struct C_MDS_LookupIno2 : public ServerContext {
3939 MDRequestRef mdr;
3940 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3941 void finish(int r) override {
3942 server->_lookup_ino_2(mdr, r);
3943 }
3944 };
3945
3946 /*
3947 * filepath: ino
3948 */
3949 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3950 bool want_parent, bool want_dentry)
3951 {
3952 const cref_t<MClientRequest> &req = mdr->client_request;
3953
3954 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3955 return _lookup_snap_ino(mdr);
3956
3957 inodeno_t ino = req->get_filepath().get_ino();
3958 auto _ino = ino.val;
3959
3960 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
3961 * I do not have an explanation for how that happened organically but this
3962 * check will ensure that the client can no longer do that.
3963 *
3964 * [1] https://tracker.ceph.com/issues/49922
3965 */
3966 if (MDS_IS_PRIVATE_INO(_ino)) {
3967 respond_to_request(mdr, -CEPHFS_ESTALE);
3968 return;
3969 }
3970
3971 CInode *in = mdcache->get_inode(ino);
3972 if (in && in->state_test(CInode::STATE_PURGING)) {
3973 respond_to_request(mdr, -CEPHFS_ESTALE);
3974 return;
3975 }
3976 if (!in) {
3977 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3978 return;
3979 }
3980
3981 // check for nothing (not read or write); this still applies the
3982 // path check.
3983 if (!check_access(mdr, in, 0))
3984 return;
3985
3986 CDentry *dn = in->get_projected_parent_dn();
3987 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3988
3989 MutationImpl::LockOpVec lov;
3990 if (dn && (want_parent || want_dentry)) {
3991 mdr->pin(dn);
3992 lov.add_rdlock(&dn->lock);
3993 }
3994
3995 unsigned mask = req->head.args.lookupino.mask;
3996 if (mask) {
3997 Capability *cap = in->get_client_cap(mdr->get_client());
3998 int issued = 0;
3999 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4000 issued = cap->issued();
4001 // FIXME
4002 // permission bits, ACL/security xattrs
4003 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4004 lov.add_rdlock(&in->authlock);
4005 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4006 lov.add_rdlock(&in->xattrlock);
4007
4008 mdr->getattr_caps = mask;
4009 }
4010
4011 if (!lov.empty()) {
4012 if (!mds->locker->acquire_locks(mdr, lov))
4013 return;
4014
4015 if (diri != NULL) {
4016 // need read access to directory inode
4017 if (!check_access(mdr, diri, MAY_READ))
4018 return;
4019 }
4020 }
4021
4022 if (want_parent) {
4023 if (in->is_base()) {
4024 respond_to_request(mdr, -CEPHFS_EINVAL);
4025 return;
4026 }
4027 if (!diri || diri->is_stray()) {
4028 respond_to_request(mdr, -CEPHFS_ESTALE);
4029 return;
4030 }
4031 dout(10) << "reply to lookup_parent " << *in << dendl;
4032 mdr->tracei = diri;
4033 respond_to_request(mdr, 0);
4034 } else {
4035 if (want_dentry) {
4036 inodeno_t dirino = req->get_filepath2().get_ino();
4037 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
4038 respond_to_request(mdr, -CEPHFS_ENOENT);
4039 return;
4040 }
4041 dout(10) << "reply to lookup_name " << *in << dendl;
4042 } else
4043 dout(10) << "reply to lookup_ino " << *in << dendl;
4044
4045 mdr->tracei = in;
4046 if (want_dentry)
4047 mdr->tracedn = dn;
4048 respond_to_request(mdr, 0);
4049 }
4050 }
4051
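// _lookup_snap_ino
//
// Resolve an ino+snapid pair: check the cached snap inode, then the head
// inode; otherwise fall back to fetching the parent dirfrag (located by the
// supplied parent ino and name hash) or open_ino().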
4052 void Server::_lookup_snap_ino(MDRequestRef& mdr)
4053 {
4054 const cref_t<MClientRequest> &req = mdr->client_request;
4055
4056 vinodeno_t vino;
4057 vino.ino = req->get_filepath().get_ino();
4058 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4059 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4060 __u32 hash = req->head.args.lookupino.hash;
4061
4062 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4063
4064 CInode *in = mdcache->lookup_snap_inode(vino);
4065 if (!in) {
4066 in = mdcache->get_inode(vino.ino);
4067 if (in) {
4068 if (in->state_test(CInode::STATE_PURGING) ||
4069 !in->has_snap_data(vino.snapid)) {
4070 if (in->is_dir() || !parent_ino) {
4071 respond_to_request(mdr, -CEPHFS_ESTALE);
4072 return;
4073 }
4074 in = NULL;
4075 }
4076 }
4077 }
4078
4079 if (in) {
4080 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4081 mdr->snapid = vino.snapid;
4082 mdr->tracei = in;
4083 respond_to_request(mdr, 0);
4084 return;
4085 }
4086
4087 CInode *diri = NULL;
4088 if (parent_ino) {
4089 diri = mdcache->get_inode(parent_ino);
4090 if (!diri) {
4091 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
4092 return;
4093 }
4094
4095 if (!diri->is_dir()) {
4096 respond_to_request(mdr, -CEPHFS_EINVAL);
4097 return;
4098 }
4099
4100 MutationImpl::LockOpVec lov;
4101 lov.add_rdlock(&diri->dirfragtreelock);
4102 if (!mds->locker->acquire_locks(mdr, lov))
4103 return;
4104
4105 frag_t frag = diri->dirfragtree[hash];
4106 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4107 if (!dir)
4108 return;
4109
4110 if (!dir->is_complete()) {
4111 if (dir->is_frozen()) {
4112 mds->locker->drop_locks(mdr.get());
4113 mdr->drop_local_auth_pins();
4114 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4115 return;
4116 }
4117 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4118 return;
4119 }
4120
4121 respond_to_request(mdr, -CEPHFS_ESTALE);
4122 } else {
4123 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4124 }
4125 }
4126
4127 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4128 {
4129 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4130 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4131
4132 // `r` is a rank if >=0, else an error code
4133 if (r >= 0) {
4134 mds_rank_t dest_rank(r);
4135 if (dest_rank == mds->get_nodeid())
4136 dispatch_client_request(mdr);
4137 else
4138 mdcache->request_forward(mdr, dest_rank);
4139 return;
4140 }
4141
4142 // give up
4143 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4144 r = -CEPHFS_ESTALE;
4145 respond_to_request(mdr, r);
4146 }
4147
4148
4149 /* This function takes responsibility for the passed mdr*/
4150 void Server::handle_client_open(MDRequestRef& mdr)
4151 {
4152 const cref_t<MClientRequest> &req = mdr->client_request;
4153 dout(7) << "open on " << req->get_filepath() << dendl;
4154
4155 int flags = req->head.args.open.flags;
4156 int cmode = ceph_flags_to_mode(flags);
4157 if (cmode < 0) {
4158 respond_to_request(mdr, -CEPHFS_EINVAL);
4159 return;
4160 }
4161
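  // non-read-only opens, O_TRUNC and O_DIRECTORY must be handled on the auth MDS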
4162 bool need_auth = !file_mode_is_readonly(cmode) ||
4163 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4164
4165 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4166 dout(7) << "read-only FS" << dendl;
4167 respond_to_request(mdr, -CEPHFS_EROFS);
4168 return;
4169 }
4170
4171 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4172 if (!cur)
4173 return;
4174
4175 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4176 ceph_assert(!need_auth);
4177 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4178 CInode *cur = rdlock_path_pin_ref(mdr, true);
4179 if (!cur)
4180 return;
4181 }
4182
4183 if (!cur->is_file()) {
4184 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4185 cmode = CEPH_FILE_MODE_PIN;
4186 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
4187 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4188 flags &= ~CEPH_O_TRUNC;
4189 }
4190
4191 dout(10) << "open flags = " << flags
4192 << ", filemode = " << cmode
4193 << ", need_auth = " << need_auth
4194 << dendl;
4195
4196 // regular file?
4197 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4198 dout(7) << "not a file or dir " << *cur << dendl;
4199 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4200 return;
4201 }*/
4202 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
4203 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4204 respond_to_request(mdr, -CEPHFS_EINVAL);
4205 return;
4206 }
4207
4208 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
4209 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4210 // return -CEPHFS_EISDIR for a directory, -CEPHFS_EINVAL for other non-regular inodes
4211 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
4212 return;
4213 }
4214
4215 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
4216 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4217 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4218 respond_to_request(mdr, -CEPHFS_EPERM);
4219 return;
4220 }
4221
4222 // snapped data is read only
4223 if (mdr->snapid != CEPH_NOSNAP &&
4224 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4225 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4226 respond_to_request(mdr, -CEPHFS_EROFS);
4227 return;
4228 }
4229
4230 MutationImpl::LockOpVec lov;
4231
4232 unsigned mask = req->head.args.open.mask;
4233 if (mask) {
4234 Capability *cap = cur->get_client_cap(mdr->get_client());
4235 int issued = 0;
4236 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4237 issued = cap->issued();
4238 // permission bits, ACL/security xattrs
4239 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4240 lov.add_rdlock(&cur->authlock);
4241 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4242 lov.add_rdlock(&cur->xattrlock);
4243
4244 mdr->getattr_caps = mask;
4245 }
4246
4247 // O_TRUNC
4248 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4249 ceph_assert(cur->is_auth());
4250
4251 lov.add_xlock(&cur->filelock);
4252 if (!mds->locker->acquire_locks(mdr, lov))
4253 return;
4254
4255 if (!check_access(mdr, cur, MAY_WRITE))
4256 return;
4257
4258 // wait for pending truncate?
4259 const auto& pi = cur->get_projected_inode();
4260 if (pi->is_truncating()) {
4261 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4262 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4263 mds->locker->drop_locks(mdr.get());
4264 mdr->drop_local_auth_pins();
4265 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4266 return;
4267 }
4268
4269 do_open_truncate(mdr, cmode);
4270 return;
4271 }
4272
4273 // sync filelock if snapped.
4274 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4275 // and that data itself is flushed so that we can read the snapped data off disk.
4276 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4277 lov.add_rdlock(&cur->filelock);
4278 }
4279
4280 if (!mds->locker->acquire_locks(mdr, lov))
4281 return;
4282
4283 mask = MAY_READ;
4284 if (cmode & CEPH_FILE_MODE_WR)
4285 mask |= MAY_WRITE;
4286 if (!check_access(mdr, cur, mask))
4287 return;
4288
4289 utime_t now = ceph_clock_now();
4290 mdr->set_mds_stamp(now);
4291
4292 if (cur->is_file() || cur->is_dir()) {
4293 if (mdr->snapid == CEPH_NOSNAP) {
4294 // register new cap
4295 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4296 if (cap)
4297 dout(12) << "open issued caps " << ccap_string(cap->pending())
4298 << " for " << req->get_source()
4299 << " on " << *cur << dendl;
4300 } else {
4301 int caps = ceph_caps_for_mode(cmode);
4302 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4303 << " for " << req->get_source()
4304 << " snapid " << mdr->snapid
4305 << " on " << *cur << dendl;
4306 mdr->snap_caps = caps;
4307 }
4308 }
4309
4310 // increase max_size?
4311 if (cmode & CEPH_FILE_MODE_WR)
4312 mds->locker->check_inode_max_size(cur);
4313
4314 // make sure this inode gets into the journal
4315 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4316 mdcache->open_file_table.should_log_open(cur)) {
4317 EOpen *le = new EOpen(mds->mdlog);
4318 mdlog->start_entry(le);
4319 le->add_clean_inode(cur);
4320 mdlog->submit_entry(le);
4321 }
4322
4323 // hit pop
4324 if (cmode & CEPH_FILE_MODE_WR)
4325 mds->balancer->hit_inode(cur, META_POP_IWR);
4326 else
4327 mds->balancer->hit_inode(cur, META_POP_IRD,
4328 mdr->client_request->get_source().num());
4329
4330 CDentry *dn = 0;
4331 if (req->get_dentry_wanted()) {
4332 ceph_assert(mdr->dn[0].size());
4333 dn = mdr->dn[0].back();
4334 }
4335
4336 mdr->tracei = cur;
4337 mdr->tracedn = dn;
4338 respond_to_request(mdr, 0);
4339 }
4340
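// Journal-commit finisher for openc: pop the projected dentry linkage, mark
// the new inode dirty, apply the mutation, replicate the dentry link, and
// reply to the client.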
4341 class C_MDS_openc_finish : public ServerLogContext {
4342 CDentry *dn;
4343 CInode *newi;
4344 public:
4345 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4346 ServerLogContext(s, r), dn(d), newi(ni) {}
4347 void finish(int r) override {
4348 ceph_assert(r == 0);
4349
4350 dn->pop_projected_linkage();
4351
4352 // dirty inode, dn, dir
4353 newi->mark_dirty(mdr->ls);
4354 newi->mark_dirty_parent(mdr->ls, true);
4355
4356 mdr->apply();
4357
4358 get_mds()->locker->share_inode_max_size(newi);
4359
4360 MDRequestRef null_ref;
4361 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4362
4363 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4364
4365 server->respond_to_request(mdr, 0);
4366
4367 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4368 }
4369 };
4370
4371 /* This function takes responsibility for the passed mdr*/
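// Create-and-open (O_CREAT): allocate a new inode, project the dentry
// linkage, issue initial caps, and journal an EUpdate("openc").  If the
// dentry already exists and O_EXCL was not given, fall back to
// handle_client_open().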
4372 void Server::handle_client_openc(MDRequestRef& mdr)
4373 {
4374 const cref_t<MClientRequest> &req = mdr->client_request;
4375 client_t client = mdr->get_client();
4376
4377 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4378
4379 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4380 if (cmode < 0) {
4381 respond_to_request(mdr, -CEPHFS_EINVAL);
4382 return;
4383 }
4384
4385 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4386 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4387 if (!dn)
4388 return;
4389
4390 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4391 if (!excl && !dnl->is_null()) {
4392 // it existed.
4393 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4394
4395 MutationImpl::LockOpVec lov;
4396 lov.add_rdlock(&dnl->get_inode()->snaplock);
4397 if (!mds->locker->acquire_locks(mdr, lov))
4398 return;
4399
4400 handle_client_open(mdr);
4401 return;
4402 }
4403
4404 ceph_assert(dnl->is_null());
4405
4406 if (req->get_alternate_name().size() > alternate_name_max) {
4407 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4408 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4409 return;
4410 }
4411 dn->set_alternate_name(req->get_alternate_name());
4412
4413 // set layout
4414 file_layout_t layout;
4415 if (mdr->dir_layout != file_layout_t())
4416 layout = mdr->dir_layout;
4417 else
4418 layout = mdcache->default_file_layout;
4419
4420 // What kind of client caps are required to complete this operation
4421 uint64_t access = MAY_WRITE;
4422
4423 const auto default_layout = layout;
4424
4425 // fill in any special params from client
4426 if (req->head.args.open.stripe_unit)
4427 layout.stripe_unit = req->head.args.open.stripe_unit;
4428 if (req->head.args.open.stripe_count)
4429 layout.stripe_count = req->head.args.open.stripe_count;
4430 if (req->head.args.open.object_size)
4431 layout.object_size = req->head.args.open.object_size;
4432 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4433 (__s32)req->head.args.open.pool >= 0) {
4434 layout.pool_id = req->head.args.open.pool;
4435
4436 // make sure we have as new a map as the client
4437 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4438 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4439 return;
4440 }
4441 }
4442
4443 // If client doesn't have capability to modify layout pools, then
4444 // only permit this request if the requested pool matches what the
4445 // file would have inherited anyway from its parent.
4446 if (default_layout != layout) {
4447 access |= MAY_SET_VXATTR;
4448 }
4449
4450 if (!layout.is_valid()) {
4451 dout(10) << " invalid initial file layout" << dendl;
4452 respond_to_request(mdr, -CEPHFS_EINVAL);
4453 return;
4454 }
4455 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4456 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4457 respond_to_request(mdr, -CEPHFS_EINVAL);
4458 return;
4459 }
4460
4461 // created null dn.
4462 CDir *dir = dn->get_dir();
4463 CInode *diri = dir->get_inode();
4464 if (!check_access(mdr, diri, access))
4465 return;
4466 if (!check_fragment_space(mdr, dir))
4467 return;
4468 if (!check_dir_max_entries(mdr, dir))
4469 return;
4470
4471 if (mdr->dn[0].size() == 1)
4472 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4473
4474 // create inode.
4475 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4476 req->head.args.open.mode | S_IFREG, &layout);
4477 ceph_assert(newi);
4478
4479 // it's a file.
4480 dn->push_projected_linkage(newi);
4481
4482 auto _inode = newi->_get_inode();
4483 _inode->version = dn->pre_dirty();
4484 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4485 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4486 _inode->update_backtrace();
4487 _inode->rstat.rfiles = 1;
4488 _inode->accounted_rstat = _inode->rstat;
4489
4490 SnapRealm *realm = diri->find_snaprealm();
4491 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4492 ceph_assert(follows >= realm->get_newest_seq());
4493
4494 ceph_assert(dn->first == follows+1);
4495 newi->first = dn->first;
4496
4497 // do the open
4498 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4499 newi->authlock.set_state(LOCK_EXCL);
4500 newi->xattrlock.set_state(LOCK_EXCL);
4501
4502 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4503 _inode->client_ranges[client].range.first = 0;
4504 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4505 _inode->client_ranges[client].follows = follows;
4506 newi->mark_clientwriteable();
4507 cap->mark_clientwriteable();
4508 }
4509
4510 // prepare finisher
4511 mdr->ls = mdlog->get_current_segment();
4512 EUpdate *le = new EUpdate(mdlog, "openc");
4513 mdlog->start_entry(le);
4514 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4515 journal_allocated_inos(mdr, &le->metablob);
4516 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4517 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4518
4519 // make sure this inode gets into the journal
4520 le->metablob.add_opened_ino(newi->ino());
4521
4522 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4523
4524 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4525 openc_response_t ocresp;
4526
4527 dout(10) << "adding created_ino and delegated_inos" << dendl;
4528 ocresp.created_ino = _inode->ino;
4529
4530 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4531 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4532 unsigned frac = 100 / delegate_inos_pct;
4533 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4534 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4535 }
4536
4537 encode(ocresp, mdr->reply_extra_bl);
4538 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4539 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4540 // add the file created flag onto the reply if create_flags features is supported
4541 encode(newi->ino(), mdr->reply_extra_bl);
4542 }
4543
4544 journal_and_reply(mdr, newi, dn, le, fin);
4545
4546 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4547 // have overshot the split size (multiple opencs in flight), so here is
4548 // an early chance to split the dir if this openc makes it oversized.
4549 mds->balancer->maybe_fragment(dir, false);
4550 }
4551
4552
4553
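// handle_client_readdir
//
// Stream one dirfrag's worth of dentries back to the client, respecting the
// per-session cap-acquisition throttle, the client's max_entries/max_bytes
// limits, and the request's snapid.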
4554 void Server::handle_client_readdir(MDRequestRef& mdr)
4555 {
4556 const cref_t<MClientRequest> &req = mdr->client_request;
4557 Session *session = mds->get_session(req);
4558 client_t client = req->get_source().num();
4559 MutationImpl::LockOpVec lov;
4560 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4561 if (!diri) return;
4562
4563 // it's a directory, right?
4564 if (!diri->is_dir()) {
4565 // not a dir
4566 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4567 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4568 return;
4569 }
4570
4571 auto num_caps = session->get_num_caps();
4572 auto session_cap_acquisition = session->get_cap_acquisition();
4573
4574 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4575 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4576 << " session_cap_acquisition: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4577 if (logger)
4578 logger->inc(l_mdss_cap_acquisition_throttle);
4579
4580 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4581 return;
4582 }
4583
4584 lov.add_rdlock(&diri->filelock);
4585 lov.add_rdlock(&diri->dirfragtreelock);
4586
4587 if (!mds->locker->acquire_locks(mdr, lov))
4588 return;
4589
4590 if (!check_access(mdr, diri, MAY_READ))
4591 return;
4592
4593 // which frag?
4594 frag_t fg = (__u32)req->head.args.readdir.frag;
4595 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4596 string offset_str = req->get_path2();
4597
4598 __u32 offset_hash = 0;
4599 if (!offset_str.empty())
4600 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4601 else
4602 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4603
4604 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4605 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4606
4607 // does the frag exist?
4608 if (diri->dirfragtree[fg.value()] != fg) {
4609 frag_t newfg;
4610 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4611 if (fg.contains((unsigned)offset_hash)) {
4612 newfg = diri->dirfragtree[offset_hash];
4613 } else {
4614 // client actually wants next frag
4615 newfg = diri->dirfragtree[fg.value()];
4616 }
4617 } else {
4618 offset_str.clear();
4619 newfg = diri->dirfragtree[fg.value()];
4620 }
4621 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4622 fg = newfg;
4623 }
4624
4625 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4626 if (!dir) return;
4627
4628 // ok!
4629 dout(10) << "handle_client_readdir on " << *dir << dendl;
4630 ceph_assert(dir->is_auth());
4631
4632 if (!dir->is_complete()) {
4633 if (dir->is_frozen()) {
4634 dout(7) << "dir is frozen " << *dir << dendl;
4635 mds->locker->drop_locks(mdr.get());
4636 mdr->drop_local_auth_pins();
4637 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4638 return;
4639 }
4640 // fetch
4641 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4642 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4643 return;
4644 }
4645
4646 #ifdef MDS_VERIFY_FRAGSTAT
4647 dir->verify_fragstat();
4648 #endif
4649
4650 utime_t now = ceph_clock_now();
4651 mdr->set_mds_stamp(now);
4652
4653 snapid_t snapid = mdr->snapid;
4654 dout(10) << "snapid " << snapid << dendl;
4655
4656 SnapRealm *realm = diri->find_snaprealm();
4657
4658 unsigned max = req->head.args.readdir.max_entries;
4659 if (!max)
4660 max = dir->get_num_any(); // whatever, something big.
4661 unsigned max_bytes = req->head.args.readdir.max_bytes;
4662 if (!max_bytes)
4663 // make sure at least one item can be encoded
4664 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4665
4666 // start final blob
4667 bufferlist dirbl;
4668 DirStat ds;
4669 ds.frag = dir->get_frag();
4670 ds.auth = dir->get_dir_auth().first;
4671 if (dir->is_auth() && !forward_all_requests_to_auth)
4672 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4673
4674 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4675
4676 // count bytes available.
4677 // this isn't perfect, but we should capture the main variable/unbounded size items!
4678 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4679 int bytes_left = max_bytes - front_bytes;
4680 bytes_left -= realm->get_snap_trace().length();
4681
4682 // build dir contents
4683 bufferlist dnbl;
4684 __u32 numfiles = 0;
4685 bool start = !offset_hash && offset_str.empty();
4686 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4687 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4688 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4689 bool end = (it == dir->end());
4690 for (; !end && numfiles < max; end = (it == dir->end())) {
4691 CDentry *dn = it->second;
4692 ++it;
4693
4694 if (dn->state_test(CDentry::STATE_PURGING))
4695 continue;
4696
4697 bool dnp = dn->use_projected(client, mdr);
4698 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4699
4700 if (dnl->is_null())
4701 continue;
4702
4703 if (dn->last < snapid || dn->first > snapid) {
4704 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4705 continue;
4706 }
4707
4708 if (!start) {
4709 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4710 if (!(offset_key < dn->key()))
4711 continue;
4712 }
4713
4714 CInode *in = dnl->get_inode();
4715
4716 if (in && in->ino() == CEPH_INO_CEPH)
4717 continue;
4718
4719 // remote link?
4720 // better for the MDS to do the work, if we think the client will stat any of these files.
4721 if (dnl->is_remote() && !in) {
4722 in = mdcache->get_inode(dnl->get_remote_ino());
4723 if (in) {
4724 dn->link_remote(dnl, in);
4725 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4726 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4727 continue;
4728 } else {
4729 // touch everything i _do_ have
4730 for (auto &p : *dir) {
4731 if (!p.second->get_linkage()->is_null())
4732 mdcache->lru.lru_touch(p.second);
4733 }
4734
4735 // already issued caps and leases, reply immediately.
4736 if (dnbl.length() > 0) {
4737 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4738 dout(10) << " open remote dentry after caps were issued, stopping at "
4739 << dnbl.length() << " < " << bytes_left << dendl;
4740 break;
4741 }
4742
4743 mds->locker->drop_locks(mdr.get());
4744 mdr->drop_local_auth_pins();
4745 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4746 return;
4747 }
4748 }
4749 ceph_assert(in);
4750
4751 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4752 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4753 break;
4754 }
4755
4756 unsigned start_len = dnbl.length();
4757
4758 // dentry
4759 dout(12) << "including dn " << *dn << dendl;
4760 encode(dn->get_name(), dnbl);
4761 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4762 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
4763
4764 // inode
4765 dout(12) << "including inode " << *in << dendl;
4766 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4767 if (r < 0) {
4768 // chop off dn->name, lease
4769 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4770 bufferlist keep;
4771 keep.substr_of(dnbl, 0, start_len);
4772 dnbl.swap(keep);
4773 break;
4774 }
4775 ceph_assert(r >= 0);
4776 numfiles++;
4777
4778 // touch dn
4779 mdcache->lru.lru_touch(dn);
4780 }
4781
4782 session->touch_readdir_cap(numfiles);
4783
4784 __u16 flags = 0;
4785 if (end) {
4786 flags = CEPH_READDIR_FRAG_END;
4787 if (start)
4788 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4789 }
4790 // clients without CEPH_READDIR_REPLY_BITFLAGS only understand the END and COMPLETE flags
4791 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4792 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4793 }
4794
4795 // finish final blob
4796 encode(numfiles, dirbl);
4797 encode(flags, dirbl);
4798 dirbl.claim_append(dnbl);
4799
4800 // yay, reply
4801 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4802 << " bytes=" << dirbl.length()
4803 << " start=" << (int)start
4804 << " end=" << (int)end
4805 << dendl;
4806 mdr->reply_extra_bl = dirbl;
4807
4808 // bump popularity. NOTE: this doesn't quite capture it.
4809 mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);
4810
4811 // reply
4812 mdr->tracei = diri;
4813 respond_to_request(mdr, 0);
4814 }
4815
4816
4817
4818 // ===============================================================================
4819 // INODE UPDATES
4820
4821
4822 /*
4823 * finisher for basic inode updates
4824 */
4825 class C_MDS_inode_update_finish : public ServerLogContext {
4826 CInode *in;
4827 bool truncating_smaller, changed_ranges, adjust_realm;
4828 public:
4829 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4830 bool sm=false, bool cr=false, bool ar=false) :
4831 ServerLogContext(s, r), in(i),
4832 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
4833 void finish(int r) override {
4834 ceph_assert(r == 0);
4835
4836 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4837
4838 // apply
4839 mdr->apply();
4840
4841 MDSRank *mds = get_mds();
4842
4843 // notify any clients
4844 if (truncating_smaller && in->get_inode()->is_truncating()) {
4845 mds->locker->issue_truncate(in);
4846 mds->mdcache->truncate_inode(in, mdr->ls);
4847 }
4848
4849 if (adjust_realm) {
4850 mds->mdcache->send_snap_update(in, 0, snap_op);
4851 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
4852 }
4853
4854 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4855
4856 server->respond_to_request(mdr, 0);
4857
4858 if (changed_ranges)
4859 get_mds()->locker->share_inode_max_size(in);
4860 }
4861 };
4862
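// handle_client_file_setlock
//
// Apply an advisory (fcntl/flock) lock change to the inode's lock state.
// Unlocks wake any waiters; blocked lock attempts either fail with
// EWOULDBLOCK/EDEADLK or park the request on WAIT_FLOCK when the client
// asked to wait.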
4863 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4864 {
4865 const cref_t<MClientRequest> &req = mdr->client_request;
4866 MutationImpl::LockOpVec lov;
4867
4868 // get the inode to operate on, and set up any locks needed for that
4869 CInode *cur = rdlock_path_pin_ref(mdr, true);
4870 if (!cur)
4871 return;
4872
4873 lov.add_xlock(&cur->flocklock);
4874 /* acquire_locks will return true if it gets the locks. If it fails,
4875 it will redeliver this request at a later date, so drop the request.
4876 */
4877 if (!mds->locker->acquire_locks(mdr, lov)) {
4878 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4879 return;
4880 }
4881
4882 // copy the lock change into a ceph_filelock so we can store/apply it
4883 ceph_filelock set_lock;
4884 set_lock.start = req->head.args.filelock_change.start;
4885 set_lock.length = req->head.args.filelock_change.length;
4886 set_lock.client = req->get_orig_source().num();
4887 set_lock.owner = req->head.args.filelock_change.owner;
4888 set_lock.pid = req->head.args.filelock_change.pid;
4889 set_lock.type = req->head.args.filelock_change.type;
4890 bool will_wait = req->head.args.filelock_change.wait;
4891
4892 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4893
4894 ceph_lock_state_t *lock_state = NULL;
4895 bool interrupt = false;
4896
4897 // get the appropriate lock state
4898 switch (req->head.args.filelock_change.rule) {
4899 case CEPH_LOCK_FLOCK_INTR:
4900 interrupt = true;
4901 // fall-thru
4902 case CEPH_LOCK_FLOCK:
4903 lock_state = cur->get_flock_lock_state();
4904 break;
4905
4906 case CEPH_LOCK_FCNTL_INTR:
4907 interrupt = true;
4908 // fall-thru
4909 case CEPH_LOCK_FCNTL:
4910 lock_state = cur->get_fcntl_lock_state();
4911 break;
4912
4913 default:
4914 dout(10) << "got unknown lock type " << set_lock.type
4915 << ", dropping request!" << dendl;
4916 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
4917 return;
4918 }
4919
4920 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4921 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4922 list<ceph_filelock> activated_locks;
4923 MDSContext::vec waiters;
4924 if (lock_state->is_waiting(set_lock)) {
4925 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4926 lock_state->remove_waiting(set_lock);
4927 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4928 } else if (!interrupt) {
4929 dout(10) << " unlock attempt on " << set_lock << dendl;
4930 lock_state->remove_lock(set_lock, activated_locks);
4931 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4932 }
4933 mds->queue_waiters(waiters);
4934
4935 respond_to_request(mdr, 0);
4936 } else {
4937 dout(10) << " lock attempt on " << set_lock << dendl;
4938 bool deadlock = false;
4939 if (mdr->more()->flock_was_waiting &&
4940 !lock_state->is_waiting(set_lock)) {
4941 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4942 respond_to_request(mdr, -CEPHFS_EINTR);
4943 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4944 dout(10) << " it failed on this attempt" << dendl;
4945 // couldn't set lock right now
4946 if (deadlock) {
4947 respond_to_request(mdr, -CEPHFS_EDEADLK);
4948 } else if (!will_wait) {
4949 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
4950 } else {
4951 dout(10) << " added to waiting list" << dendl;
4952 ceph_assert(lock_state->is_waiting(set_lock));
4953 mdr->more()->flock_was_waiting = true;
4954 mds->locker->drop_locks(mdr.get());
4955 mdr->drop_local_auth_pins();
4956 mdr->mark_event("failed to add lock, waiting");
4957 mdr->mark_nowarn();
4958 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4959 }
4960 } else
4961 respond_to_request(mdr, 0);
4962 }
4963 dout(10) << " state after lock change: " << *lock_state << dendl;
4964 }
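
// A hedged illustration of the lock record this handler works with. The field
// copies above come straight from req->head.args.filelock_change; the type
// constants named here (CEPH_LOCK_EXCL, CEPH_LOCK_UNLOCK) are assumed to come
// from include/cephfs/ceph_fs.h and are not defined in this file.
//
//   ceph_filelock l{};
//   l.start  = 0;              // lock begins at byte 0
//   l.length = 0;              // 0 means "through EOF", i.e. the whole file
//   l.owner  = owner_cookie;   // client-supplied owner id (hypothetical name)
//   l.pid    = client_pid;     // advisory; used when reporting conflicts
//   l.type   = CEPH_LOCK_EXCL; // request an exclusive lock;
//                              // CEPH_LOCK_UNLOCK releases it (see above)
//
// With will_wait set, a conflicting request is parked on the inode's
// WAIT_FLOCK list instead of failing with -CEPHFS_EWOULDBLOCK.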
4965
4966 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4967 {
4968 const cref_t<MClientRequest> &req = mdr->client_request;
4969 MutationImpl::LockOpVec lov;
4970
4971 // get the inode to operate on, and set up any locks needed for that
4972 CInode *cur = rdlock_path_pin_ref(mdr, true);
4973 if (!cur)
4974 return;
4975
4976 /* acquire_locks will return true if it gets the locks. If it fails,
4977 it will redeliver this request at a later date, so drop the request.
4978 */
4979 lov.add_rdlock(&cur->flocklock);
4980 if (!mds->locker->acquire_locks(mdr, lov)) {
4981 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4982 return;
4983 }
4984
4985 // copy the lock change into a ceph_filelock so we can store/apply it
4986 ceph_filelock checking_lock;
4987 checking_lock.start = req->head.args.filelock_change.start;
4988 checking_lock.length = req->head.args.filelock_change.length;
4989 checking_lock.client = req->get_orig_source().num();
4990 checking_lock.owner = req->head.args.filelock_change.owner;
4991 checking_lock.pid = req->head.args.filelock_change.pid;
4992 checking_lock.type = req->head.args.filelock_change.type;
4993
4994 // get the appropriate lock state
4995 ceph_lock_state_t *lock_state = NULL;
4996 switch (req->head.args.filelock_change.rule) {
4997 case CEPH_LOCK_FLOCK:
4998 lock_state = cur->get_flock_lock_state();
4999 break;
5000
5001 case CEPH_LOCK_FCNTL:
5002 lock_state = cur->get_fcntl_lock_state();
5003 break;
5004
5005 default:
5006 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
5007 respond_to_request(mdr, -CEPHFS_EINVAL);
5008 return;
5009 }
5010 lock_state->look_for_lock(checking_lock);
5011
5012 bufferlist lock_bl;
5013 encode(checking_lock, lock_bl);
5014
5015 mdr->reply_extra_bl = lock_bl;
5016 respond_to_request(mdr, 0);
5017 }
5018
5019 void Server::handle_client_setattr(MDRequestRef& mdr)
5020 {
5021 const cref_t<MClientRequest> &req = mdr->client_request;
5022 MutationImpl::LockOpVec lov;
5023 CInode *cur = rdlock_path_pin_ref(mdr, true);
5024 if (!cur) return;
5025
5026 if (mdr->snapid != CEPH_NOSNAP) {
5027 respond_to_request(mdr, -CEPHFS_EROFS);
5028 return;
5029 }
5030 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
5031 respond_to_request(mdr, -CEPHFS_EPERM);
5032 return;
5033 }
5034
5035 __u32 mask = req->head.args.setattr.mask;
5036 __u32 access_mask = MAY_WRITE;
5037
5038 // xlock inode
5039 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
5040 lov.add_xlock(&cur->authlock);
5041 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
5042 lov.add_xlock(&cur->filelock);
5043 if (mask & CEPH_SETATTR_CTIME)
5044 lov.add_wrlock(&cur->versionlock);
5045
5046 if (!mds->locker->acquire_locks(mdr, lov))
5047 return;
5048
5049 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
5050 access_mask |= MAY_CHOWN;
5051
5052 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
5053 access_mask |= MAY_CHGRP;
5054
5055 if (!check_access(mdr, cur, access_mask))
5056 return;
5057
5058 // trunc from bigger -> smaller?
5059 const auto& pip = cur->get_projected_inode();
5060
5061 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
5062
5063 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5064 if (is_full && req->head.args.setattr.size > old_size) {
5065 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5066 respond_to_request(mdr, -CEPHFS_ENOSPC);
5067 return;
5068 }
5069
5070 bool truncating_smaller = false;
5071 if (mask & CEPH_SETATTR_SIZE) {
5072 truncating_smaller = req->head.args.setattr.size < old_size;
5073 if (truncating_smaller && pip->is_truncating()) {
5074 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5075 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5076 mds->locker->drop_locks(mdr.get());
5077 mdr->drop_local_auth_pins();
5078 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5079 return;
5080 }
5081 }
5082
5083 bool changed_ranges = false;
5084
5085 // project update
5086 mdr->ls = mdlog->get_current_segment();
5087 EUpdate *le = new EUpdate(mdlog, "setattr");
5088 mdlog->start_entry(le);
5089
5090 auto pi = cur->project_inode(mdr);
5091
5092 if (mask & CEPH_SETATTR_UID)
5093 pi.inode->uid = req->head.args.setattr.uid;
5094 if (mask & CEPH_SETATTR_GID)
5095 pi.inode->gid = req->head.args.setattr.gid;
5096
5097 if (mask & CEPH_SETATTR_MODE)
5098 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5099 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
5100 S_ISREG(pi.inode->mode) &&
5101 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5102 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5103 }
5104
5105 if (mask & CEPH_SETATTR_MTIME)
5106 pi.inode->mtime = req->head.args.setattr.mtime;
5107 if (mask & CEPH_SETATTR_ATIME)
5108 pi.inode->atime = req->head.args.setattr.atime;
5109 if (mask & CEPH_SETATTR_BTIME)
5110 pi.inode->btime = req->head.args.setattr.btime;
5111 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5112 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5113 if (mask & CEPH_SETATTR_SIZE) {
5114 if (truncating_smaller) {
5115 pi.inode->truncate(old_size, req->head.args.setattr.size);
5116 le->metablob.add_truncate_start(cur->ino());
5117 } else {
5118 pi.inode->size = req->head.args.setattr.size;
5119 pi.inode->rstat.rbytes = pi.inode->size;
5120 }
5121 pi.inode->mtime = mdr->get_op_stamp();
5122
5123 // adjust client's max_size?
5124 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5125 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5126 << " -> " << pi.inode->client_ranges << dendl;
5127 changed_ranges = true;
5128 }
5129 }
5130
5131 pi.inode->version = cur->pre_dirty();
5132 pi.inode->ctime = mdr->get_op_stamp();
5133 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5134 pi.inode->rstat.rctime = mdr->get_op_stamp();
5135 pi.inode->change_attr++;
5136
5137 // log + wait
5138 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5139 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5140 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5141
5142 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5143 truncating_smaller, changed_ranges));
5144
5145 // flush immediately if there are readers/writers waiting
5146 if (mdr->is_xlocked(&cur->filelock) &&
5147 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5148 mds->mdlog->flush();
5149 }
5150
5151 /* Takes responsibility for mdr */
5152 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5153 {
5154 CInode *in = mdr->in[0];
5155 client_t client = mdr->get_client();
5156 ceph_assert(in);
5157
5158 dout(10) << "do_open_truncate " << *in << dendl;
5159
5160 SnapRealm *realm = in->find_snaprealm();
5161 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5162
5163 mdr->ls = mdlog->get_current_segment();
5164 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5165 mdlog->start_entry(le);
5166
5167 // prepare
5168 auto pi = in->project_inode(mdr);
5169 pi.inode->version = in->pre_dirty();
5170 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5171 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5172 pi.inode->rstat.rctime = mdr->get_op_stamp();
5173 pi.inode->change_attr++;
5174
5175 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5176 if (old_size > 0) {
5177 pi.inode->truncate(old_size, 0);
5178 le->metablob.add_truncate_start(in->ino());
5179 }
5180
5181 bool changed_ranges = false;
5182 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5183 pi.inode->client_ranges[client].range.first = 0;
5184 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5185 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5186 changed_ranges = true;
5187 in->mark_clientwriteable();
5188 cap->mark_clientwriteable();
5189 }
5190
5191 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5192
5193 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5194 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5195
5196 // make sure ino gets into the journal
5197 le->metablob.add_opened_ino(in->ino());
5198
5199 mdr->o_trunc = true;
5200
5201 CDentry *dn = 0;
5202 if (mdr->client_request->get_dentry_wanted()) {
5203 ceph_assert(mdr->dn[0].size());
5204 dn = mdr->dn[0].back();
5205 }
5206
5207 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5208 changed_ranges));
5209   // Although the `open` part can give an early reply, the truncation won't
5210   // happen until our EUpdate is persistent; to give the client a prompt
5211   // response we must also flush that event.
5212 mdlog->flush();
5213 }
5214
5215
5216 /* This function cleans up the passed mdr */
5217 void Server::handle_client_setlayout(MDRequestRef& mdr)
5218 {
5219 const cref_t<MClientRequest> &req = mdr->client_request;
5220 CInode *cur = rdlock_path_pin_ref(mdr, true);
5221 if (!cur) return;
5222
5223 if (mdr->snapid != CEPH_NOSNAP) {
5224 respond_to_request(mdr, -CEPHFS_EROFS);
5225 return;
5226 }
5227 if (!cur->is_file()) {
5228 respond_to_request(mdr, -CEPHFS_EINVAL);
5229 return;
5230 }
5231 if (cur->get_projected_inode()->size ||
5232 cur->get_projected_inode()->truncate_seq > 1) {
5233 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5234 return;
5235 }
5236
5237 // validate layout
5238 file_layout_t layout = cur->get_projected_inode()->layout;
5239 // save existing layout for later
5240 const auto old_layout = layout;
5241
5242 int access = MAY_WRITE;
5243
5244 if (req->head.args.setlayout.layout.fl_object_size > 0)
5245 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5246 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5247 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5248 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5249 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5250 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5251 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5252
5253 // make sure we have as new a map as the client
5254 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5255 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5256 return;
5257 }
5258 }
5259
5260 // Don't permit layout modifications without 'p' caps
5261 if (layout != old_layout) {
5262 access |= MAY_SET_VXATTR;
5263 }
5264
5265 if (!layout.is_valid()) {
5266 dout(10) << "bad layout" << dendl;
5267 respond_to_request(mdr, -CEPHFS_EINVAL);
5268 return;
5269 }
5270 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5271 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5272 respond_to_request(mdr, -CEPHFS_EINVAL);
5273 return;
5274 }
5275
5276 MutationImpl::LockOpVec lov;
5277 lov.add_xlock(&cur->filelock);
5278 if (!mds->locker->acquire_locks(mdr, lov))
5279 return;
5280
5281 if (!check_access(mdr, cur, access))
5282 return;
5283
5284 // project update
5285 auto pi = cur->project_inode(mdr);
5286 pi.inode->layout = layout;
5287 // add the old pool to the inode
5288 pi.inode->add_old_pool(old_layout.pool_id);
5289 pi.inode->version = cur->pre_dirty();
5290 pi.inode->ctime = mdr->get_op_stamp();
5291 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5292 pi.inode->rstat.rctime = mdr->get_op_stamp();
5293 pi.inode->change_attr++;
5294
5295 // log + wait
5296 mdr->ls = mdlog->get_current_segment();
5297 EUpdate *le = new EUpdate(mdlog, "setlayout");
5298 mdlog->start_entry(le);
5299 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5300 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5301 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5302
5303 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5304 }
5305
5306 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5307 {
5308 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5309 return true;
5310
5311 MutationImpl::LockOpVec lov;
5312 lov.add_xlock(&in->policylock);
5313 if (xlock_snaplock)
5314 lov.add_xlock(&in->snaplock);
5315 else
5316 lov.add_rdlock(&in->snaplock);
5317 if (!mds->locker->acquire_locks(mdr, lov))
5318 return false;
5319
5320 if (want_layout && in->get_projected_inode()->has_layout()) {
5321 mdr->dir_layout = in->get_projected_inode()->layout;
5322 want_layout = false;
5323 }
5324 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5325 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5326 return false;
5327 }
5328
5329 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5330 return true;
5331 }
5332
5333 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5334 {
5335 CInode *in = mdcache->get_inode(ino);
5336 if (!in || in->state_test(CInode::STATE_PURGING)) {
5337 respond_to_request(mdr, -CEPHFS_ESTALE);
5338 return nullptr;
5339 }
5340 if (!in->is_auth()) {
5341 mdcache->request_forward(mdr, in->authority().first);
5342 return nullptr;
5343 }
5344
5345 return in;
5346 }
5347
5348 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5349 {
5350 const cref_t<MClientRequest> &req = mdr->client_request;
5351
5352 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5353 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5354 if (!cur)
5355 return;
5356
5357 if (!cur->is_dir()) {
5358 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5359 return;
5360 }
5361
5362 if (!xlock_policylock(mdr, cur, true))
5363 return;
5364
5365 // validate layout
5366 const auto& old_pi = cur->get_projected_inode();
5367 file_layout_t layout;
5368 if (old_pi->has_layout())
5369 layout = old_pi->layout;
5370 else if (mdr->dir_layout != file_layout_t())
5371 layout = mdr->dir_layout;
5372 else
5373 layout = mdcache->default_file_layout;
5374
5375 // Level of access required to complete
5376 int access = MAY_WRITE;
5377
5378 const auto old_layout = layout;
5379
5380 if (req->head.args.setlayout.layout.fl_object_size > 0)
5381 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5382 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5383 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5384 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5385 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5386 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5387 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5388 // make sure we have as new a map as the client
5389 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5390 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5391 return;
5392 }
5393 }
5394
5395 if (layout != old_layout) {
5396 access |= MAY_SET_VXATTR;
5397 }
5398
5399 if (!layout.is_valid()) {
5400 dout(10) << "bad layout" << dendl;
5401 respond_to_request(mdr, -CEPHFS_EINVAL);
5402 return;
5403 }
5404 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5405 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5406 respond_to_request(mdr, -CEPHFS_EINVAL);
5407 return;
5408 }
5409
5410 if (!check_access(mdr, cur, access))
5411 return;
5412
5413 auto pi = cur->project_inode(mdr);
5414 pi.inode->layout = layout;
5415 pi.inode->version = cur->pre_dirty();
5416
5417 // log + wait
5418 mdr->ls = mdlog->get_current_segment();
5419 EUpdate *le = new EUpdate(mdlog, "setlayout");
5420 mdlog->start_entry(le);
5421 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5422 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5423 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5424
5425 mdr->no_early_reply = true;
5426 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5427 }
5428
5429 // XATTRS
5430 int Server::parse_layout_vxattr_json(
5431 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5432 {
5433 auto parse_pool = [&](std::string pool_name, int64_t pool_id) -> int64_t {
5434 if (pool_name != "") {
5435 int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
5436 if (_pool_id < 0) {
5437 dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
5438 return -CEPHFS_EINVAL;
5439 }
5440 return _pool_id;
5441 } else if (pool_id >= 0) {
5442 const auto pools = osdmap.get_pools();
5443 if (pools.find(pool_id) == pools.end()) {
5444 dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
5445 return -CEPHFS_EINVAL;
5446 }
5447 return pool_id;
5448 } else {
5449 return -CEPHFS_EINVAL;
5450 }
5451 };
5452
5453 try {
5454 if (name == "layout.json") {
5455 JSONParser json_parser;
5456 if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
5457 std::string field;
5458 try {
5459 field = "object_size";
5460 JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
5461
5462 field = "stripe_unit";
5463 JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
5464
5465 field = "stripe_count";
5466 JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
5467
5468 field = "pool_namespace";
5469 JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
5470
5471 field = "pool_id";
5472 int64_t pool_id = 0;
5473 JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
5474
5475 field = "pool_name";
5476 std::string pool_name;
5477 JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
5478
5479 pool_id = parse_pool(pool_name, pool_id);
5480 if (pool_id < 0) {
5481 return (int)pool_id;
5482 }
5483 layout->pool_id = pool_id;
5484 } catch (JSONDecoder::err&) {
5485 dout(10) << __func__ << ": json is missing a mandatory field named "
5486 << field << dendl;
5487 return -CEPHFS_EINVAL;
5488 }
5489 } else {
5490 dout(10) << __func__ << ": bad json" << dendl;
5491 return -CEPHFS_EINVAL;
5492 }
5493 } else {
5494 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5495 return -CEPHFS_ENODATA; // no such attribute
5496 }
5497 } catch (boost::bad_lexical_cast const&) {
5498 dout(10) << __func__ << ": bad vxattr value:" << value
5499 << ", unable to parse for xattr:" << name << dendl;
5500 return -CEPHFS_EINVAL;
5501 }
5502 return 0;
5503 }
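
// A hedged example of a value that parse_layout_vxattr_json() above would
// accept for "layout.json"; the pool name "cephfs_data" is illustrative only:
//
//   {"object_size": 4194304, "stripe_unit": 4194304, "stripe_count": 1,
//    "pool_name": "cephfs_data", "pool_namespace": ""}
//
// object_size, stripe_unit and stripe_count are mandatory (decode_json is
// invoked with mandatory=true); the pool may be named via pool_name or given
// numerically via pool_id, and an unknown pool is rejected with
// -CEPHFS_EINVAL by parse_pool().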
5504
5505 // parse old style layout string
5506 int Server::parse_layout_vxattr_string(
5507 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5508 {
5509 try {
5510 if (name == "layout") {
5511 string::iterator begin = value.begin();
5512 string::iterator end = value.end();
5513 keys_and_values<string::iterator> p; // create instance of parser
5514 std::map<string, string> m; // map to receive results
5515 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5516 return -CEPHFS_EINVAL;
5517 }
5518 string left(begin, end);
5519 dout(10) << __func__ << ": parsed " << m << " left '" << left << "'" << dendl;
5520 if (begin != end)
5521 return -CEPHFS_EINVAL;
5522 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5523         // Skip validation on each attr; we do it once at the end (to avoid
5524         // rejecting intermediate states if the overall result is ok)
5525 int r = parse_layout_vxattr_string(string("layout.") + q->first, q->second,
5526 osdmap, layout);
5527 if (r < 0)
5528 return r;
5529 }
5530 } else if (name == "layout.object_size") {
5531 layout->object_size = boost::lexical_cast<unsigned>(value);
5532 } else if (name == "layout.stripe_unit") {
5533 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5534 } else if (name == "layout.stripe_count") {
5535 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5536 } else if (name == "layout.pool") {
5537 try {
5538 layout->pool_id = boost::lexical_cast<unsigned>(value);
5539 } catch (boost::bad_lexical_cast const&) {
5540 int64_t pool = osdmap.lookup_pg_pool_name(value);
5541 if (pool < 0) {
5542 dout(10) << __func__ << ": unknown pool " << value << dendl;
5543 return -CEPHFS_ENOENT;
5544 }
5545 layout->pool_id = pool;
5546 }
5547 } else if (name == "layout.pool_id") {
5548 layout->pool_id = boost::lexical_cast<int64_t>(value);
5549 } else if (name == "layout.pool_name") {
5550 layout->pool_id = osdmap.lookup_pg_pool_name(value);
5551 if (layout->pool_id < 0) {
5552 dout(10) << __func__ << ": unknown pool " << value << dendl;
5553 return -CEPHFS_EINVAL;
5554 }
5555 } else if (name == "layout.pool_namespace") {
5556 layout->pool_ns = value;
5557 } else {
5558 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5559 return -CEPHFS_ENODATA; // no such attribute
5560 }
5561 } catch (boost::bad_lexical_cast const&) {
5562 dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
5563 << name << dendl;
5564 return -CEPHFS_EINVAL;
5565 }
5566 return 0;
5567 }
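
// A hedged example of the old-style string form handled above. The aggregate
// "layout" vxattr takes whitespace-separated key=value pairs, each of which
// is re-parsed as "layout.<key>"; the exact pair syntax is defined by the
// keys_and_values parser, not shown in this excerpt:
//
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
//
// Individual fields can also be set one at a time, e.g. "layout.pool_id=2" or
// "layout.pool_name=cephfs_data" (pool names are resolved through the OSDMap).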
5568
5569 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5570 file_layout_t *layout, bool validate)
5571 {
5572 dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
5573
5574 int r;
5575 if (name == "layout.json") {
5576 r = parse_layout_vxattr_json(name, value, osdmap, layout);
5577 } else {
5578 r = parse_layout_vxattr_string(name, value, osdmap, layout);
5579 }
5580 if (r < 0) {
5581 return r;
5582 }
5583
5584 if (validate && !layout->is_valid()) {
5585 dout(10) << __func__ << ": bad layout" << dendl;
5586 return -CEPHFS_EINVAL;
5587 }
5588 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5589 dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
5590 return -CEPHFS_EINVAL;
5591 }
5592 return 0;
5593 }
5594
5595 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5596 {
5597 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5598 try {
5599 if (name == "quota") {
5600 string::iterator begin = value.begin();
5601 string::iterator end = value.end();
5602 if (begin == end) {
5603 // keep quota unchanged. (for create_quota_realm())
5604 return 0;
5605 }
5606 keys_and_values<string::iterator> p; // create instance of parser
5607 std::map<string, string> m; // map to receive results
5608 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5609 return -CEPHFS_EINVAL;
5610 }
5611 string left(begin, end);
5612 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5613 if (begin != end)
5614 return -CEPHFS_EINVAL;
5615 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5616 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5617 if (r < 0)
5618 return r;
5619 }
5620 } else if (name == "quota.max_bytes") {
5621 int64_t q = boost::lexical_cast<int64_t>(value);
5622 if (q < 0)
5623 return -CEPHFS_EINVAL;
5624 quota->max_bytes = q;
5625 } else if (name == "quota.max_files") {
5626 int64_t q = boost::lexical_cast<int64_t>(value);
5627 if (q < 0)
5628 return -CEPHFS_EINVAL;
5629 quota->max_files = q;
5630 } else {
5631 dout(10) << " unknown quota vxattr " << name << dendl;
5632 return -CEPHFS_EINVAL;
5633 }
5634 } catch (boost::bad_lexical_cast const&) {
5635 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5636 return -CEPHFS_EINVAL;
5637 }
5638
5639 if (!quota->is_valid()) {
5640 dout(10) << "bad quota" << dendl;
5641 return -CEPHFS_EINVAL;
5642 }
5643 return 0;
5644 }
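
// A hedged example of the quota vxattr values accepted above. The aggregate
// "quota" form takes key=value pairs, each re-parsed as "quota.<key>";
// negative values are rejected:
//
//   "max_bytes=107374182400 max_files=100000"
//
// Clients typically set these one at a time, e.g. (illustrative command, not
// part of this file):
//
//   setfattr -n ceph.quota.max_bytes -v 107374182400 /mnt/cephfs/some/dir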
5645
5646 void Server::create_quota_realm(CInode *in)
5647 {
5648 dout(10) << __func__ << " " << *in << dendl;
5649
5650 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5651 req->set_filepath(filepath(in->ino()));
5652 req->set_string2("ceph.quota");
5653 // empty vxattr value
5654 req->set_tid(mds->issue_tid());
5655
5656 mds->send_message_mds(req, in->authority().first);
5657 }
5658
5659 /*
5660  * Verify that the file layout attribute carried by the client
5661  * is well-formed.
5662  * Return 0 on success; otherwise this function takes
5663  * responsibility for the passed mdr.
5664 */
5665 int Server::check_layout_vxattr(MDRequestRef& mdr,
5666 string name,
5667 string value,
5668 file_layout_t *layout)
5669 {
5670 const cref_t<MClientRequest> &req = mdr->client_request;
5671 epoch_t epoch;
5672 int r;
5673
5674 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5675 r = parse_layout_vxattr(name, value, osdmap, layout);
5676 epoch = osdmap.get_epoch();
5677 });
5678
5679 if (r == -CEPHFS_ENOENT) {
5680
5681     // we don't have the specified pool; make sure our map
5682     // is at least as new as the client's.
5683 epoch_t req_epoch = req->get_osdmap_epoch();
5684
5685 if (req_epoch > epoch) {
5686
5687 // well, our map is older. consult mds.
5688 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5689
5690 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5691 return r;
5692 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5693
5694     // For compatibility with clients running old code, we still need to get
5695     // the latest map. One day, once COMPACT_VERSION of MClientRequest >= 3,
5696     // we can remove this code.
5697 mdr->waited_for_osdmap = true;
5698 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5699 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5700 return r;
5701 }
5702 }
5703
5704 if (r < 0) {
5705
5706 if (r == -CEPHFS_ENOENT)
5707 r = -CEPHFS_EINVAL;
5708
5709 respond_to_request(mdr, r);
5710 return r;
5711 }
5712
5713 // all is well
5714 return 0;
5715 }
5716
5717 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5718 {
5719 const cref_t<MClientRequest> &req = mdr->client_request;
5720 string name(req->get_path2());
5721 bufferlist bl = req->get_data();
5722 string value (bl.c_str(), bl.length());
5723 dout(10) << "handle_set_vxattr " << name
5724 << " val " << value.length()
5725 << " bytes on " << *cur
5726 << dendl;
5727
5728 CInode::mempool_inode *pip = nullptr;
5729 string rest;
5730
5731 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5732 return;
5733 }
5734
5735 bool adjust_realm = false;
5736 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5737 if (!cur->is_dir()) {
5738 respond_to_request(mdr, -CEPHFS_EINVAL);
5739 return;
5740 }
5741
5742 if (!xlock_policylock(mdr, cur, true))
5743 return;
5744
5745 file_layout_t layout;
5746 if (cur->get_projected_inode()->has_layout())
5747 layout = cur->get_projected_inode()->layout;
5748 else if (mdr->dir_layout != file_layout_t())
5749 layout = mdr->dir_layout;
5750 else
5751 layout = mdcache->default_file_layout;
5752
5753 rest = name.substr(name.find("layout"));
5754 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5755 return;
5756
5757 auto pi = cur->project_inode(mdr);
5758 pi.inode->layout = layout;
5759 mdr->no_early_reply = true;
5760 pip = pi.inode.get();
5761 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5762 if (!cur->is_file()) {
5763 respond_to_request(mdr, -CEPHFS_EINVAL);
5764 return;
5765 }
5766 if (cur->get_projected_inode()->size ||
5767 cur->get_projected_inode()->truncate_seq > 1) {
5768 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5769 return;
5770 }
5771 file_layout_t layout = cur->get_projected_inode()->layout;
5772 rest = name.substr(name.find("layout"));
5773 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5774 return;
5775
5776 MutationImpl::LockOpVec lov;
5777 lov.add_xlock(&cur->filelock);
5778 if (!mds->locker->acquire_locks(mdr, lov))
5779 return;
5780
5781 auto pi = cur->project_inode(mdr);
5782 int64_t old_pool = pi.inode->layout.pool_id;
5783 pi.inode->add_old_pool(old_pool);
5784 pi.inode->layout = layout;
5785 pip = pi.inode.get();
5786 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5787 if (!cur->is_dir()) {
5788 respond_to_request(mdr, -CEPHFS_EINVAL);
5789 return;
5790 }
5791
5792 quota_info_t quota = cur->get_projected_inode()->quota;
5793
5794 rest = name.substr(name.find("quota"));
5795 int r = parse_quota_vxattr(rest, value, &quota);
5796 if (r < 0) {
5797 respond_to_request(mdr, r);
5798 return;
5799 }
5800
5801 if (quota.is_enable() && !cur->get_projected_srnode())
5802 adjust_realm = true;
5803
5804 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5805 return;
5806
5807 if (cur->get_projected_inode()->quota == quota) {
5808 respond_to_request(mdr, 0);
5809 return;
5810 }
5811
5812 auto pi = cur->project_inode(mdr, false, adjust_realm);
5813 pi.inode->quota = quota;
5814
5815 if (adjust_realm)
5816 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5817
5818 mdr->no_early_reply = true;
5819 pip = pi.inode.get();
5820
5821 client_t exclude_ct = mdr->get_client();
5822 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5823 } else if (name == "ceph.dir.subvolume"sv) {
5824 if (!cur->is_dir()) {
5825 respond_to_request(mdr, -CEPHFS_EINVAL);
5826 return;
5827 }
5828
5829 bool val;
5830 try {
5831 val = boost::lexical_cast<bool>(value);
5832 } catch (boost::bad_lexical_cast const&) {
5833 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5834 respond_to_request(mdr, -CEPHFS_EINVAL);
5835 return;
5836 }
5837
5838   /* Verify it's not already a subvolume, using the lighter-weight
5839    * rdlock.
5840    */
5841 if (!mdr->more()->rdonly_checks) {
5842 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
5843 MutationImpl::LockOpVec lov;
5844 lov.add_rdlock(&cur->snaplock);
5845 if (!mds->locker->acquire_locks(mdr, lov))
5846 return;
5847 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5848 }
5849 const auto srnode = cur->get_projected_srnode();
5850 if (val == (srnode && srnode->is_subvolume())) {
5851 dout(20) << "already marked subvolume" << dendl;
5852 respond_to_request(mdr, 0);
5853 return;
5854 }
5855 mdr->more()->rdonly_checks = true;
5856 }
5857
5858 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
5859 /* drop the rdlock and acquire xlocks */
5860 dout(20) << "dropping rdlocks" << dendl;
5861 mds->locker->drop_locks(mdr.get());
5862 if (!xlock_policylock(mdr, cur, false, true))
5863 return;
5864 }
5865
5866   /* repeat the rdonly checks in case things changed between the rdlock and the xlock */
5867 SnapRealm *realm = cur->find_snaprealm();
5868 if (val) {
5869 inodeno_t subvol_ino = realm->get_subvolume_ino();
5870 // can't create subvolume inside another subvolume
5871 if (subvol_ino && subvol_ino != cur->ino()) {
5872 respond_to_request(mdr, -CEPHFS_EINVAL);
5873 return;
5874 }
5875 }
5876
5877 const auto srnode = cur->get_projected_srnode();
5878 if (val == (srnode && srnode->is_subvolume())) {
5879 respond_to_request(mdr, 0);
5880 return;
5881 }
5882
5883 auto pi = cur->project_inode(mdr, false, true);
5884 if (!srnode)
5885 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5886 if (val)
5887 pi.snapnode->mark_subvolume();
5888 else
5889 pi.snapnode->clear_subvolume();
5890
5891 mdr->no_early_reply = true;
5892 pip = pi.inode.get();
5893 adjust_realm = true;
5894 } else if (name == "ceph.dir.pin"sv) {
5895 if (!cur->is_dir() || cur->is_root()) {
5896 respond_to_request(mdr, -CEPHFS_EINVAL);
5897 return;
5898 }
5899
5900 mds_rank_t rank;
5901 try {
5902 rank = boost::lexical_cast<mds_rank_t>(value);
5903 if (rank < 0) rank = MDS_RANK_NONE;
5904 else if (rank >= MAX_MDS) {
5905 respond_to_request(mdr, -CEPHFS_EDOM);
5906 return;
5907 }
5908 } catch (boost::bad_lexical_cast const&) {
5909 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5910 respond_to_request(mdr, -CEPHFS_EINVAL);
5911 return;
5912 }
5913
5914 if (!xlock_policylock(mdr, cur))
5915 return;
5916
5917 auto pi = cur->project_inode(mdr);
5918 cur->set_export_pin(rank);
5919 pip = pi.inode.get();
5920 } else if (name == "ceph.dir.pin.random"sv) {
5921 if (!cur->is_dir() || cur->is_root()) {
5922 respond_to_request(mdr, -CEPHFS_EINVAL);
5923 return;
5924 }
5925
5926 double val;
5927 try {
5928 val = boost::lexical_cast<double>(value);
5929 } catch (boost::bad_lexical_cast const&) {
5930 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
5931 respond_to_request(mdr, -CEPHFS_EINVAL);
5932 return;
5933 }
5934
5935 if (val < 0.0 || 1.0 < val) {
5936 respond_to_request(mdr, -CEPHFS_EDOM);
5937 return;
5938 } else if (mdcache->export_ephemeral_random_max < val) {
5939 respond_to_request(mdr, -CEPHFS_EINVAL);
5940 return;
5941 }
5942
5943 if (!xlock_policylock(mdr, cur))
5944 return;
5945
5946 auto pi = cur->project_inode(mdr);
5947 cur->setxattr_ephemeral_rand(val);
5948 pip = pi.inode.get();
5949 } else if (name == "ceph.dir.pin.distributed"sv) {
5950 if (!cur->is_dir() || cur->is_root()) {
5951 respond_to_request(mdr, -CEPHFS_EINVAL);
5952 return;
5953 }
5954
5955 bool val;
5956 try {
5957 val = boost::lexical_cast<bool>(value);
5958 } catch (boost::bad_lexical_cast const&) {
5959 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5960 respond_to_request(mdr, -CEPHFS_EINVAL);
5961 return;
5962 }
5963
5964 if (!xlock_policylock(mdr, cur))
5965 return;
5966
5967 auto pi = cur->project_inode(mdr);
5968 cur->setxattr_ephemeral_dist(val);
5969 pip = pi.inode.get();
5970 } else {
5971 dout(10) << " unknown vxattr " << name << dendl;
5972 respond_to_request(mdr, -CEPHFS_EINVAL);
5973 return;
5974 }
5975
5976 pip->change_attr++;
5977 pip->ctime = mdr->get_op_stamp();
5978 if (mdr->get_op_stamp() > pip->rstat.rctime)
5979 pip->rstat.rctime = mdr->get_op_stamp();
5980 pip->version = cur->pre_dirty();
5981 if (cur->is_file())
5982 pip->update_backtrace();
5983
5984 // log + wait
5985 mdr->ls = mdlog->get_current_segment();
5986 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5987 mdlog->start_entry(le);
5988 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5989 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5990 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5991
5992 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5993 false, false, adjust_realm));
5994 return;
5995 }
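
// For orientation, the vxattr families handled above are: ceph.dir.layout*,
// ceph.file.layout*, ceph.quota*, ceph.dir.subvolume, ceph.dir.pin,
// ceph.dir.pin.random and ceph.dir.pin.distributed; anything else falls
// through to -CEPHFS_EINVAL. A hedged example of how such a request is
// usually produced on a client (illustrative command, not part of this file):
//
//   setfattr -n ceph.dir.pin -v 1 /mnt/cephfs/some/dir   # pin subtree to rank 1
//
// which arrives here as a CEPH_MDS_OP_SETXATTR whose path2 is "ceph.dir.pin"
// and whose value is carried in the request data payload (see
// handle_client_setxattr below).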
5996
5997 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
5998 {
5999 const cref_t<MClientRequest> &req = mdr->client_request;
6000 string name(req->get_path2());
6001
6002 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
6003
6004 if (name == "ceph.dir.layout") {
6005 if (!cur->is_dir()) {
6006 respond_to_request(mdr, -CEPHFS_ENODATA);
6007 return;
6008 }
6009 if (cur->is_root()) {
6010 dout(10) << "can't remove layout policy on the root directory" << dendl;
6011 respond_to_request(mdr, -CEPHFS_EINVAL);
6012 return;
6013 }
6014
6015 if (!cur->get_projected_inode()->has_layout()) {
6016 respond_to_request(mdr, -CEPHFS_ENODATA);
6017 return;
6018 }
6019
6020 MutationImpl::LockOpVec lov;
6021 lov.add_xlock(&cur->policylock);
6022 if (!mds->locker->acquire_locks(mdr, lov))
6023 return;
6024
6025 auto pi = cur->project_inode(mdr);
6026 pi.inode->clear_layout();
6027 pi.inode->version = cur->pre_dirty();
6028
6029 // log + wait
6030 mdr->ls = mdlog->get_current_segment();
6031 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
6032 mdlog->start_entry(le);
6033 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6034 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6035 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6036
6037 mdr->no_early_reply = true;
6038 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6039 return;
6040 } else if (name == "ceph.dir.layout.pool_namespace"
6041 || name == "ceph.file.layout.pool_namespace") {
6042     // Namespace is the only layout field that has a meaningful
6043     // null/none value (an empty string means the default layout). Removing
6044     // it is equivalent to a setxattr with an empty string, so pass through
6045     // the empty payload of the rmxattr request to do this.
6046 handle_set_vxattr(mdr, cur);
6047 return;
6048 }
6049
6050 respond_to_request(mdr, -CEPHFS_ENODATA);
6051 }
6052
6053 const Server::XattrHandler Server::xattr_handlers[] = {
6054 {
6055 xattr_name: Server::DEFAULT_HANDLER,
6056 description: "default xattr handler",
6057 validate: &Server::default_xattr_validate,
6058 setxattr: &Server::default_setxattr_handler,
6059 removexattr: &Server::default_removexattr_handler,
6060 },
6061 {
6062 xattr_name: "ceph.mirror.info",
6063 description: "mirror info xattr handler",
6064 validate: &Server::mirror_info_xattr_validate,
6065 setxattr: &Server::mirror_info_setxattr_handler,
6066 removexattr: &Server::mirror_info_removexattr_handler
6067 },
6068 };
6069
6070 const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
6071 const XattrHandler *default_xattr_handler = nullptr;
6072
6073 for (auto &handler : xattr_handlers) {
6074 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
6075 ceph_assert(default_xattr_handler == nullptr);
6076 default_xattr_handler = &handler;
6077 }
6078 if (handler.xattr_name == xattr_name) {
6079 dout(20) << "handler=" << handler.description << dendl;
6080 return &handler;
6081 }
6082 }
6083
6084 ceph_assert(default_xattr_handler != nullptr);
6085 dout(20) << "handler=" << default_xattr_handler->description << dendl;
6086 return default_xattr_handler;
6087 }
6088
6089 int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6090 const std::string &xattr_name, int op, int flags) {
6091 if (op == CEPH_MDS_OP_SETXATTR) {
6092 if (xattrs) {
6093 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
6094 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
6095 return -CEPHFS_EEXIST;
6096 }
6097 }
6098 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
6099 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
6100 return -CEPHFS_ENODATA;
6101 }
6102
6103 return 0;
6104 }
6105
6106 if (op == CEPH_MDS_OP_RMXATTR) {
6107 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6108 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6109 return -CEPHFS_ENODATA;
6110 }
6111
6112 return 0;
6113 }
6114
6115 derr << ": unhandled validation for: " << xattr_name << dendl;
6116 return -CEPHFS_EINVAL;
6117 }
6118
6119 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6120 const bufferlist &xattr_value) {
6121 size_t len = xattr_value.length();
6122 bufferptr b = buffer::create(len);
6123 if (len) {
6124 xattr_value.begin().copy(len, b.c_str());
6125 }
6126 auto em = xattrs->emplace(std::piecewise_construct,
6127 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6128 std::forward_as_tuple(b));
6129 if (!em.second) {
6130 em.first->second = b;
6131 }
6132 }
6133
6134 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6135 xattrs->erase(mempool::mds_co::string(xattr_name));
6136 }
6137
6138 int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6139 XattrOp *xattr_op) {
6140 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6141 }
6142
6143 void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6144 const XattrOp &xattr_op) {
6145 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6146 }
6147
6148 void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6149 const XattrOp &xattr_op) {
6150 xattr_rm(xattrs, xattr_op.xattr_name);
6151 }
6152
6153 // mirror info xattr handlers
6154 const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6155 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6156 "[a-f0-9]{4}-[a-f0-9]{12})" \
6157 " fs_id=(\\d+)$";
6158 const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6159 const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
6160 int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6161 std::string &cluster_id, std::string &fs_id) {
6162 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6163
6164 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6165 std::smatch match;
6166
6167 std::regex_search(value, match, regex);
6168 if (match.size() != 3) {
6169 derr << "mirror info parse error" << dendl;
6170 return -CEPHFS_EINVAL;
6171 }
6172
6173 cluster_id = match[1];
6174 fs_id = match[2];
6175 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6176 return 0;
6177 }
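
// A hedged example of a value matching MIRROR_INFO_REGEX above (the UUID and
// fs id are made up for illustration):
//
//   "cluster_id=0f1a2b3c-4d5e-6f70-8192-a3b4c5d6e7f8 fs_id=1"
//
// which parses into cluster_id = "0f1a2b3c-4d5e-6f70-8192-a3b4c5d6e7f8" and
// fs_id = "1"; anything not matching the regex is rejected with
// -CEPHFS_EINVAL.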
6178
6179 int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6180 XattrOp *xattr_op) {
6181 if (!cur->is_root()) {
6182 return -CEPHFS_EINVAL;
6183 }
6184
6185 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6186 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6187 if (v1 != v2) {
6188 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6189 return -CEPHFS_EINVAL;
6190 }
6191
6192 if (v1 < 0) {
6193 return v1;
6194 }
6195
6196 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6197 return 0;
6198 }
6199
6200 std::string cluster_id;
6201 std::string fs_id;
6202 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6203 cluster_id, fs_id);
6204 if (r < 0) {
6205 return r;
6206 }
6207
6208 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6209 return 0;
6210 }
6211
6212 void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6213 const XattrOp &xattr_op) {
6214 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6215
6216 bufferlist bl;
6217 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6218 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6219
6220 bl.clear();
6221 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6222 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6223 }
6224
6225 void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6226 const XattrOp &xattr_op) {
6227 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6228 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6229 }
6230
6231 void Server::handle_client_setxattr(MDRequestRef& mdr)
6232 {
6233 const cref_t<MClientRequest> &req = mdr->client_request;
6234 string name(req->get_path2());
6235
6236 // is a ceph virtual xattr?
6237 if (is_ceph_vxattr(name)) {
6238 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6239 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6240 if (!cur)
6241 return;
6242
6243 handle_set_vxattr(mdr, cur);
6244 return;
6245 }
6246
6247 if (!is_allowed_ceph_xattr(name)) {
6248 respond_to_request(mdr, -CEPHFS_EINVAL);
6249 return;
6250 }
6251
6252 CInode *cur = rdlock_path_pin_ref(mdr, true);
6253 if (!cur)
6254 return;
6255
6256 if (mdr->snapid != CEPH_NOSNAP) {
6257 respond_to_request(mdr, -CEPHFS_EROFS);
6258 return;
6259 }
6260
6261 int flags = req->head.args.setxattr.flags;
6262
6263 MutationImpl::LockOpVec lov;
6264 lov.add_xlock(&cur->xattrlock);
6265 if (!mds->locker->acquire_locks(mdr, lov))
6266 return;
6267
6268 if (!check_access(mdr, cur, MAY_WRITE))
6269 return;
6270
6271 size_t len = req->get_data().length();
6272 size_t inc = len + name.length();
6273
6274 auto handler = Server::get_xattr_or_default_handler(name);
6275 const auto& pxattrs = cur->get_projected_xattrs();
6276 if (pxattrs) {
6277 // check xattrs kv pairs size
6278 size_t cur_xattrs_size = 0;
6279 for (const auto& p : *pxattrs) {
6280 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6281 continue;
6282 }
6283 cur_xattrs_size += p.first.length() + p.second.length();
6284 }
6285
6286 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6287 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6288 << cur_xattrs_size << ", inc " << inc << dendl;
6289 respond_to_request(mdr, -CEPHFS_ENOSPC);
6290 return;
6291 }
6292 }
6293
6294 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6295 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6296 if (r < 0) {
6297 respond_to_request(mdr, r);
6298 return;
6299 }
6300
6301 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6302
6303 // project update
6304 auto pi = cur->project_inode(mdr, true);
6305 pi.inode->version = cur->pre_dirty();
6306 pi.inode->ctime = mdr->get_op_stamp();
6307 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6308 pi.inode->rstat.rctime = mdr->get_op_stamp();
6309 if (name == "encryption.ctx"sv)
6310 pi.inode->fscrypt = true;
6311 pi.inode->change_attr++;
6312 pi.inode->xattr_version++;
6313
6314 if ((flags & CEPH_XATTR_REMOVE)) {
6315 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6316 } else {
6317 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6318 }
6319
6320 // log + wait
6321 mdr->ls = mdlog->get_current_segment();
6322 EUpdate *le = new EUpdate(mdlog, "setxattr");
6323 mdlog->start_entry(le);
6324 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6325 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6326 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6327
6328 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6329 }
6330
6331 void Server::handle_client_removexattr(MDRequestRef& mdr)
6332 {
6333 const cref_t<MClientRequest> &req = mdr->client_request;
6334 std::string name(req->get_path2());
6335
6336 // is a ceph virtual xattr?
6337 if (is_ceph_vxattr(name)) {
6338 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6339 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6340 if (!cur)
6341 return;
6342
6343 handle_remove_vxattr(mdr, cur);
6344 return;
6345 }
6346
6347 if (!is_allowed_ceph_xattr(name)) {
6348 respond_to_request(mdr, -CEPHFS_EINVAL);
6349 return;
6350 }
6351
6352 CInode* cur = rdlock_path_pin_ref(mdr, true);
6353 if (!cur)
6354 return;
6355
6356 if (mdr->snapid != CEPH_NOSNAP) {
6357 respond_to_request(mdr, -CEPHFS_EROFS);
6358 return;
6359 }
6360
6361 MutationImpl::LockOpVec lov;
6362 lov.add_xlock(&cur->xattrlock);
6363 if (!mds->locker->acquire_locks(mdr, lov))
6364 return;
6365
6366
6367 auto handler = Server::get_xattr_or_default_handler(name);
6368 bufferlist bl;
6369 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6370
6371 const auto& pxattrs = cur->get_projected_xattrs();
6372 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6373 if (r < 0) {
6374 respond_to_request(mdr, r);
6375 return;
6376 }
6377
6378 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6379
6380 // project update
6381 auto pi = cur->project_inode(mdr, true);
6382 pi.inode->version = cur->pre_dirty();
6383 pi.inode->ctime = mdr->get_op_stamp();
6384 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6385 pi.inode->rstat.rctime = mdr->get_op_stamp();
6386 pi.inode->change_attr++;
6387 pi.inode->xattr_version++;
6388 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6389
6390 // log + wait
6391 mdr->ls = mdlog->get_current_segment();
6392 EUpdate *le = new EUpdate(mdlog, "removexattr");
6393 mdlog->start_entry(le);
6394 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6395 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6396 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6397
6398 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6399 }
6400
6401 void Server::handle_client_getvxattr(MDRequestRef& mdr)
6402 {
6403 const auto& req = mdr->client_request;
6404 string xattr_name{req->get_path2()};
6405
6406 // is a ceph virtual xattr?
6407 if (!is_ceph_vxattr(xattr_name)) {
6408 respond_to_request(mdr, -CEPHFS_ENODATA);
6409 return;
6410 }
6411
6412 CInode *cur = rdlock_path_pin_ref(mdr, true, false);
6413 if (!cur) {
6414 return;
6415 }
6416
6417 if (is_ceph_dir_vxattr(xattr_name)) {
6418 if (!cur->is_dir()) {
6419 respond_to_request(mdr, -CEPHFS_ENODATA);
6420 return;
6421 }
6422 } else if (is_ceph_file_vxattr(xattr_name)) {
6423 if (cur->is_dir()) {
6424 respond_to_request(mdr, -CEPHFS_ENODATA);
6425 return;
6426 }
6427 }
6428
6429 CachedStackStringStream css;
6430 int r = 0;
6431 ceph::bufferlist bl;
6432 // handle these vxattrs
6433 if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
6434 (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
6435 std::string layout_field;
6436
6437 struct layout_xattr_info_t {
6438 enum class InheritanceStatus : uint32_t {
6439 DEFAULT = 0,
6440 SET = 1,
6441 INHERITED = 2
6442 };
6443
6444 const file_layout_t layout;
6445 const InheritanceStatus status;
6446
6447 layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
6448 : layout(l), status(inh) { }
6449
6450 static std::string status_to_string(InheritanceStatus status) {
6451 switch (status) {
6452 case InheritanceStatus::DEFAULT: return "default"s;
6453 case InheritanceStatus::SET: return "set"s;
6454 case InheritanceStatus::INHERITED: return "inherited"s;
6455 default: return "unknown"s;
6456 }
6457 }
6458 };
6459
6460 auto is_default_layout = [&](const file_layout_t& layout) -> bool {
6461 return (layout == mdcache->default_file_layout);
6462 };
6463 auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
6464 auto orig_in = cur;
6465
6466 while (cur) {
6467 if (cur->get_projected_inode()->has_layout()) {
6468 auto& curr_layout = cur->get_projected_inode()->layout;
6469 if (is_default_layout(curr_layout)) {
6470 return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
6471 }
6472 if (cur == orig_in) {
6473 // we've found a new layout at this inode
6474 return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
6475 } else {
6476 return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
6477 }
6478 }
6479
6480 if (cur->is_root()) {
6481 break;
6482 }
6483
6484 cur = cur->get_projected_parent_dir()->get_inode();
6485 }
6486 mds->clog->error() << "no layout found at root dir!";
6487 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6488 };
6489
6490 if (xattr_name == "ceph.dir.layout.json"sv ||
6491 xattr_name == "ceph.file.layout.json"sv) {
6492 // fetch layout only for valid xattr_name
6493 const auto lxi = get_inherited_layout(cur);
6494
6495 *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
6496 << ", \"stripe_count\": " << lxi.layout.stripe_count
6497 << ", \"object_size\": " << lxi.layout.object_size
6498 << ", \"pool_name\": ";
6499 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6500 *css << "\"";
6501 if (o.have_pg_pool(lxi.layout.pool_id)) {
6502 *css << o.get_pool_name(lxi.layout.pool_id);
6503 }
6504 *css << "\"";
6505 });
6506 *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
6507 *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
6508 *css << ", \"inheritance\": \"@"
6509 << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
6510 } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
6511 (xattr_name == "ceph.file.layout.pool_name"sv)) {
6512 // fetch layout only for valid xattr_name
6513 const auto lxi = get_inherited_layout(cur);
6514 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6515 if (o.have_pg_pool(lxi.layout.pool_id)) {
6516 *css << o.get_pool_name(lxi.layout.pool_id);
6517 }
6518 });
6519 } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
6520 (xattr_name == "ceph.file.layout.pool_id"sv)) {
6521 // fetch layout only for valid xattr_name
6522 const auto lxi = get_inherited_layout(cur);
6523 *css << (uint64_t)lxi.layout.pool_id;
6524 } else {
6525 r = -CEPHFS_ENODATA; // no such attribute
6526 }
6527 } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
6528 if (xattr_name == "ceph.dir.pin"sv) {
6529 *css << cur->get_projected_inode()->export_pin;
6530 } else if (xattr_name == "ceph.dir.pin.random"sv) {
6531 *css << cur->get_projected_inode()->export_ephemeral_random_pin;
6532 } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
6533 *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
6534 } else {
6535 // otherwise respond as invalid request
6536 // since we only handle ceph vxattrs here
6537 r = -CEPHFS_ENODATA; // no such attribute
6538 }
6539 } else {
6540 // otherwise respond as invalid request
6541 // since we only handle ceph vxattrs here
6542 r = -CEPHFS_ENODATA; // no such attribute
6543 }
6544
6545 if (r == 0) {
6546 ENCODE_START(1, 1, bl);
6547 encode(css->strv(), bl);
6548 ENCODE_FINISH(bl);
6549 mdr->reply_extra_bl = bl;
6550 }
6551
6552 respond_to_request(mdr, r);
6553 }
6554
6555 // =================================================================
6556 // DIRECTORY and NAMESPACE OPS
6557
6558
6559 // ------------------------------------------------
6560
6561 // MKNOD
6562
6563 class C_MDS_mknod_finish : public ServerLogContext {
6564 CDentry *dn;
6565 CInode *newi;
6566 public:
6567 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6568 ServerLogContext(s, r), dn(d), newi(ni) {}
6569 void finish(int r) override {
6570 ceph_assert(r == 0);
6571
6572 // link the inode
6573 dn->pop_projected_linkage();
6574
6575 // be a bit hacky with the inode version, here.. we decrement it
6576 // just to keep mark_dirty() happy. (we didn't bother projecting
6577 // a new version of the inode since it's just been created)
6578 newi->mark_dirty(mdr->ls);
6579 newi->mark_dirty_parent(mdr->ls, true);
6580
6581 // mkdir?
6582 if (newi->is_dir()) {
6583 CDir *dir = newi->get_dirfrag(frag_t());
6584 ceph_assert(dir);
6585 dir->mark_dirty(mdr->ls);
6586 dir->mark_new(mdr->ls);
6587 }
6588
6589 mdr->apply();
6590
6591 MDRequestRef null_ref;
6592 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6593
6594 if (newi->is_file()) {
6595 get_mds()->locker->share_inode_max_size(newi);
6596 } else if (newi->is_dir()) {
6597 // We do this now so that the linkages on the new directory are stable.
6598 newi->maybe_ephemeral_rand();
6599 }
6600
6601 // hit pop
6602 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6603
6604 // reply
6605 server->respond_to_request(mdr, 0);
6606 }
6607 };
6608
6609
6610 void Server::handle_client_mknod(MDRequestRef& mdr)
6611 {
6612 const cref_t<MClientRequest> &req = mdr->client_request;
6613 client_t client = mdr->get_client();
6614
6615 unsigned mode = req->head.args.mknod.mode;
6616 if ((mode & S_IFMT) == 0)
6617 mode |= S_IFREG;
6618
6619 mdr->disable_lock_cache();
6620 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6621 if (!dn)
6622 return;
6623
6624 CDir *dir = dn->get_dir();
6625 CInode *diri = dir->get_inode();
6626 if (!check_access(mdr, diri, MAY_WRITE))
6627 return;
6628 if (!check_fragment_space(mdr, dir))
6629 return;
6630 if (!check_dir_max_entries(mdr, dir))
6631 return;
6632
6633 ceph_assert(dn->get_projected_linkage()->is_null());
6634 if (req->get_alternate_name().size() > alternate_name_max) {
6635 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6636 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6637 return;
6638 }
6639 dn->set_alternate_name(req->get_alternate_name());
6640
6641 // set layout
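      // a layout inherited from an ancestor directory (mdr->dir_layout), if any,
      // takes precedence over the filesystem default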
6642 file_layout_t layout;
6643 if (mdr->dir_layout != file_layout_t())
6644 layout = mdr->dir_layout;
6645 else
6646 layout = mdcache->default_file_layout;
6647
6648 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6649 ceph_assert(newi);
6650
6651 dn->push_projected_linkage(newi);
6652
6653 auto _inode = newi->_get_inode();
6654 _inode->version = dn->pre_dirty();
6655 _inode->rdev = req->head.args.mknod.rdev;
6656 _inode->rstat.rfiles = 1;
6657 _inode->accounted_rstat = _inode->rstat;
6658 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6659 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6660 _inode->update_backtrace();
6661
6662 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6663 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6664 ceph_assert(follows >= realm->get_newest_seq());
6665
6666 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6667 // want to write to it (e.g., if they are reexporting NFS)
6668 if (S_ISREG(_inode->mode)) {
6669 // issue a cap on the file
6670 int cmode = CEPH_FILE_MODE_RDWR;
6671 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6672 if (cap) {
6673 cap->set_wanted(0);
6674
6675 // put locks in excl mode
6676 newi->filelock.set_state(LOCK_EXCL);
6677 newi->authlock.set_state(LOCK_EXCL);
6678 newi->xattrlock.set_state(LOCK_EXCL);
6679
6680 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6681 _inode->client_ranges[client].range.first = 0;
6682 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6683 _inode->client_ranges[client].follows = follows;
6684 newi->mark_clientwriteable();
6685 cap->mark_clientwriteable();
6686 }
6687 }
6688
6689 ceph_assert(dn->first == follows + 1);
6690 newi->first = dn->first;
6691
6692 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6693
6694 // prepare finisher
6695 mdr->ls = mdlog->get_current_segment();
6696 EUpdate *le = new EUpdate(mdlog, "mknod");
6697 mdlog->start_entry(le);
6698 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6699 journal_allocated_inos(mdr, &le->metablob);
6700
6701 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6702 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6703 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6704
6705 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
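      // as with mkdir below, give the balancer an early chance to split the
      // dirfrag in case this create pushed it past the split size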
6706 mds->balancer->maybe_fragment(dn->get_dir(), false);
6707 }
6708
6709
6710
6711 // MKDIR
6712 /* This function takes responsibility for the passed mdr*/
6713 void Server::handle_client_mkdir(MDRequestRef& mdr)
6714 {
6715 const cref_t<MClientRequest> &req = mdr->client_request;
6716
6717 mdr->disable_lock_cache();
6718 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6719 if (!dn)
6720 return;
6721
6722 CDir *dir = dn->get_dir();
6723 CInode *diri = dir->get_inode();
6724
6725 // mkdir check access
6726 if (!check_access(mdr, diri, MAY_WRITE))
6727 return;
6728
6729 if (!check_fragment_space(mdr, dir))
6730 return;
6731 if (!check_dir_max_entries(mdr, dir))
6732 return;
6733
6734 ceph_assert(dn->get_projected_linkage()->is_null());
6735 if (req->get_alternate_name().size() > alternate_name_max) {
6736 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6737 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6738 return;
6739 }
6740 dn->set_alternate_name(req->get_alternate_name());
6741
6742 // new inode
6743 unsigned mode = req->head.args.mkdir.mode;
6744 mode &= ~S_IFMT;
6745 mode |= S_IFDIR;
6746 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6747 ceph_assert(newi);
6748
6749 // it's a directory.
6750 dn->push_projected_linkage(newi);
6751
6752 auto _inode = newi->_get_inode();
6753 _inode->version = dn->pre_dirty();
6754 _inode->rstat.rsubdirs = 1;
6755 _inode->accounted_rstat = _inode->rstat;
6756 _inode->update_backtrace();
6757
6758 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6759 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6760 ceph_assert(follows >= realm->get_newest_seq());
6761
6762 dout(12) << " follows " << follows << dendl;
6763 ceph_assert(dn->first == follows + 1);
6764 newi->first = dn->first;
6765
6766 // ...and that new dir is empty.
6767 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6768 newdir->state_set(CDir::STATE_CREATING);
6769 newdir->mark_complete();
6770 newdir->_get_fnode()->version = newdir->pre_dirty();
6771
6772 // prepare finisher
6773 mdr->ls = mdlog->get_current_segment();
6774 EUpdate *le = new EUpdate(mdlog, "mkdir");
6775 mdlog->start_entry(le);
6776 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6777 journal_allocated_inos(mdr, &le->metablob);
6778 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6779 le->metablob.add_primary_dentry(dn, newi, true, true);
6780 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6781
6782 // issue a cap on the directory
6783 int cmode = CEPH_FILE_MODE_RDWR;
6784 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6785 if (cap) {
6786 cap->set_wanted(0);
6787
6788 // put locks in excl mode
6789 newi->filelock.set_state(LOCK_EXCL);
6790 newi->authlock.set_state(LOCK_EXCL);
6791 newi->xattrlock.set_state(LOCK_EXCL);
6792 }
6793
6794 // make sure this inode gets into the journal
6795 le->metablob.add_opened_ino(newi->ino());
6796
6797 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6798
6799 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6800 // have overshot the split size (multiple mkdir in flight), so here is
6801 // an early chance to split the dir if this mkdir makes it oversized.
6802 mds->balancer->maybe_fragment(dir, false);
6803 }
6804
6805
6806 // SYMLINK
6807
6808 void Server::handle_client_symlink(MDRequestRef& mdr)
6809 {
6810 const auto& req = mdr->client_request;
6811
6812 mdr->disable_lock_cache();
6813 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6814 if (!dn)
6815 return;
6816
6817 CDir *dir = dn->get_dir();
6818 CInode *diri = dir->get_inode();
6819
6820 if (!check_access(mdr, diri, MAY_WRITE))
6821 return;
6822 if (!check_fragment_space(mdr, dir))
6823 return;
6824 if (!check_dir_max_entries(mdr, dir))
6825 return;
6826
6827 ceph_assert(dn->get_projected_linkage()->is_null());
6828 if (req->get_alternate_name().size() > alternate_name_max) {
6829 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6830 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
     return;
6831 }
6832 dn->set_alternate_name(req->get_alternate_name());
6833
6834 unsigned mode = S_IFLNK | 0777;
6835 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6836 ceph_assert(newi);
6837
6838 // it's a symlink
6839 dn->push_projected_linkage(newi);
6840
6841 newi->symlink = req->get_path2();
6842 auto _inode = newi->_get_inode();
6843 _inode->version = dn->pre_dirty();
6844 _inode->size = newi->symlink.length();
6845 _inode->rstat.rbytes = _inode->size;
6846 _inode->rstat.rfiles = 1;
6847 _inode->accounted_rstat = _inode->rstat;
6848 _inode->update_backtrace();
6849
6850 newi->first = dn->first;
6851
6852 // prepare finisher
6853 mdr->ls = mdlog->get_current_segment();
6854 EUpdate *le = new EUpdate(mdlog, "symlink");
6855 mdlog->start_entry(le);
6856 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6857 journal_allocated_inos(mdr, &le->metablob);
6858 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6859 le->metablob.add_primary_dentry(dn, newi, true, true);
6860
6861 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6862 mds->balancer->maybe_fragment(dir, false);
6863 }
6864
6865
6866
6867
6868
6869 // LINK
6870
6871 void Server::handle_client_link(MDRequestRef& mdr)
6872 {
6873 const cref_t<MClientRequest> &req = mdr->client_request;
6874
6875 dout(7) << "handle_client_link " << req->get_filepath()
6876 << " to " << req->get_filepath2()
6877 << dendl;
6878
6879 mdr->disable_lock_cache();
6880
6881 CDentry *destdn;
6882 CInode *targeti;
6883
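      // path2 names the link target: an empty path means the client passed only
      // an inode number, so look the target up directly; otherwise resolve both
      // paths and take the target from the second one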
6884 if (req->get_filepath2().depth() == 0) {
6885 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6886 if (!targeti) {
6887 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
6888 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6889 return;
6890 }
6891 mdr->pin(targeti);
6892
6893 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6894 CDentry *pdn = targeti->get_projected_parent_dn();
6895 if (!pdn) {
6896 dout(7) << "target has no parent dn, failing..." << dendl;
6897 respond_to_request(mdr, -CEPHFS_EINVAL);
6898 return;
6899 }
6900 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6901 return;
6902 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6903 }
6904
6905 destdn = rdlock_path_xlock_dentry(mdr, false);
6906 if (!destdn)
6907 return;
6908 } else {
6909 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6910 destdn = ret.first;
6911 if (!destdn)
6912 return;
6913
6914 if (!destdn->get_projected_linkage()->is_null()) {
6915 respond_to_request(mdr, -CEPHFS_EEXIST);
6916 return;
6917 }
6918
6919 targeti = ret.second->get_projected_linkage()->get_inode();
6920 }
6921
6922 ceph_assert(destdn->get_projected_linkage()->is_null());
6923 if (req->get_alternate_name().size() > alternate_name_max) {
6924 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6925 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6926 return;
6927 }
6928 destdn->set_alternate_name(req->get_alternate_name());
6929
6930 if (targeti->is_dir()) {
6931 dout(7) << "target is a dir, failing..." << dendl;
6932 respond_to_request(mdr, -CEPHFS_EINVAL);
6933 return;
6934 }
6935
6936 CDir *dir = destdn->get_dir();
6937 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
6938 dout(7) << "target is " << *targeti << dendl;
6939
6940 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6941 MutationImpl::LockOpVec lov;
6942 lov.add_xlock(&targeti->snaplock);
6943 lov.add_xlock(&targeti->linklock);
6944
6945 if (!mds->locker->acquire_locks(mdr, lov))
6946 return;
6947
6948 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6949 }
6950
6951 if (targeti->get_projected_inode()->nlink == 0) {
6952 dout(7) << "target has no link, failing..." << dendl;
6953 respond_to_request(mdr, -CEPHFS_ENOENT);
6954 return;
6955 }
6956
6957 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6958 if (!check_access(mdr, targeti, MAY_WRITE))
6959 return;
6960
6961 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6962 return;
6963
6964 if (!check_fragment_space(mdr, dir))
6965 return;
6966
6967 if (!check_dir_max_entries(mdr, dir))
6968 return;
6969 }
6970
6971 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
6972 SnapRealm *target_realm = target_pin->find_snaprealm();
6973 if (target_pin != dir->inode &&
6974 target_realm->get_subvolume_ino() !=
6975 dir->inode->find_snaprealm()->get_subvolume_ino()) {
6976 dout(7) << "target is in different subvolume, failing..." << dendl;
6977 respond_to_request(mdr, -CEPHFS_EXDEV);
6978 return;
6979 }
6980
6981 // go!
6982 ceph_assert(g_conf()->mds_kill_link_at != 1);
6983
6984 // local or remote?
6985 if (targeti->is_auth())
6986 _link_local(mdr, destdn, targeti, target_realm);
6987 else
6988 _link_remote(mdr, true, destdn, targeti);
6989 mds->balancer->maybe_fragment(dir, false);
6990 }
6991
6992
6993 class C_MDS_link_local_finish : public ServerLogContext {
6994 CDentry *dn;
6995 CInode *targeti;
6996 version_t dnpv;
6997 version_t tipv;
6998 bool adjust_realm;
6999 public:
7000 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
7001 version_t dnpv_, version_t tipv_, bool ar) :
7002 ServerLogContext(s, r), dn(d), targeti(ti),
7003 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7004 void finish(int r) override {
7005 ceph_assert(r == 0);
7006 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7007 }
7008 };
7009
7010
7011 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7012 {
7013 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
7014
7015 mdr->ls = mdlog->get_current_segment();
7016
7017 // predirty NEW dentry
7018 version_t dnpv = dn->pre_dirty();
7019 version_t tipv = targeti->pre_dirty();
7020
7021 // project inode update
7022 auto pi = targeti->project_inode(mdr);
7023 pi.inode->nlink++;
7024 pi.inode->ctime = mdr->get_op_stamp();
7025 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7026 pi.inode->rstat.rctime = mdr->get_op_stamp();
7027 pi.inode->change_attr++;
7028 pi.inode->version = tipv;
7029
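      // a hard-linked inode can be reached from more than one directory tree, so
      // (outside a subvolume) mark its snaprealm global and record the parent
      // dentry, so that snapshots taken above either link can still apply to it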
7030 bool adjust_realm = false;
7031 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7032 sr_t *newsnap = targeti->project_snaprealm();
7033 targeti->mark_snaprealm_global(newsnap);
7034 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
7035 adjust_realm = true;
7036 }
7037
7038 // log + wait
7039 EUpdate *le = new EUpdate(mdlog, "link_local");
7040 mdlog->start_entry(le);
7041 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7042 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
7043 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
7044 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7045 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
7046
7047 // do this after predirty_*, to avoid funky extra dnl arg
7048 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7049
7050 journal_and_reply(mdr, targeti, dn, le,
7051 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7052 }
7053
7054 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
7055 version_t dnpv, version_t tipv, bool adjust_realm)
7056 {
7057 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
7058
7059 // link and unlock the NEW dentry
7060 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7061 if (!dnl->get_inode())
7062 dn->link_remote(dnl, targeti);
7063 dn->mark_dirty(dnpv, mdr->ls);
7064
7065 // target inode
7066 mdr->apply();
7067
7068 MDRequestRef null_ref;
7069 mdcache->send_dentry_link(dn, null_ref);
7070
7071 if (adjust_realm) {
7072 int op = CEPH_SNAP_OP_SPLIT;
7073 mds->mdcache->send_snap_update(targeti, 0, op);
7074 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7075 }
7076
7077 // bump target popularity
7078 mds->balancer->hit_inode(targeti, META_POP_IWR);
7079 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7080
7081 // reply
7082 respond_to_request(mdr, 0);
7083 }
7084
7085
7086 // link / unlink remote
7087
7088 class C_MDS_link_remote_finish : public ServerLogContext {
7089 bool inc;
7090 CDentry *dn;
7091 CInode *targeti;
7092 version_t dpv;
7093 public:
7094 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
7095 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
7096 dpv(d->get_projected_version()) {}
7097 void finish(int r) override {
7098 ceph_assert(r == 0);
7099 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
7100 }
7101 };
7102
7103 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
7104 {
7105 dout(10) << "_link_remote "
7106 << (inc ? "link ":"unlink ")
7107 << *dn << " to " << *targeti << dendl;
7108
7109 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7110 mds_rank_t linkauth = targeti->authority().first;
7111 if (mdr->more()->witnessed.count(linkauth) == 0) {
7112 if (mds->is_cluster_degraded() &&
7113 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
7114 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
7115 if (mdr->more()->waiting_on_peer.empty())
7116 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
7117 return;
7118 }
7119
7120 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
7121 int op;
7122 if (inc)
7123 op = MMDSPeerRequest::OP_LINKPREP;
7124 else
7125 op = MMDSPeerRequest::OP_UNLINKPREP;
7126 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7127 targeti->set_object_info(req->get_object_info());
7128 req->op_stamp = mdr->get_op_stamp();
7129 if (auto& desti_srnode = mdr->more()->desti_srnode)
7130 encode(*desti_srnode, req->desti_snapbl);
7131 mds->send_message_mds(req, linkauth);
7132
7133 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
7134 mdr->more()->waiting_on_peer.insert(linkauth);
7135 return;
7136 }
7137 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
7138
7139 ceph_assert(g_conf()->mds_kill_link_at != 2);
7140
7141 if (auto& desti_srnode = mdr->more()->desti_srnode) {
7142 delete desti_srnode;
7143 desti_srnode = NULL;
7144 }
7145
7146 mdr->set_mds_stamp(ceph_clock_now());
7147
7148 // add to event
7149 mdr->ls = mdlog->get_current_segment();
7150 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
7151 mdlog->start_entry(le);
7152 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7153 if (!mdr->more()->witnessed.empty()) {
7154 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7155 le->reqid = mdr->reqid;
7156 le->had_peers = true;
7157 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7158 }
7159
7160 if (inc) {
7161 dn->pre_dirty();
7162 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
7163 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7164 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7165 } else {
7166 dn->pre_dirty();
7167 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
7168 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7169 le->metablob.add_null_dentry(dn, true);
7170 dn->push_projected_linkage();
7171 }
7172
7173 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
7174 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7175 }
7176
7177 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
7178 CDentry *dn, CInode *targeti,
7179 version_t dpv)
7180 {
7181 dout(10) << "_link_remote_finish "
7182 << (inc ? "link ":"unlink ")
7183 << *dn << " to " << *targeti << dendl;
7184
7185 ceph_assert(g_conf()->mds_kill_link_at != 3);
7186
7187 if (!mdr->more()->witnessed.empty())
7188 mdcache->logged_leader_update(mdr->reqid);
7189
7190 if (inc) {
7191 // link the new dentry
7192 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7193 if (!dnl->get_inode())
7194 dn->link_remote(dnl, targeti);
7195 dn->mark_dirty(dpv, mdr->ls);
7196 } else {
7197 // unlink main dentry
7198 dn->get_dir()->unlink_inode(dn);
7199 dn->pop_projected_linkage();
7200 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
7201 }
7202
7203 mdr->apply();
7204
7205 MDRequestRef null_ref;
7206 if (inc)
7207 mdcache->send_dentry_link(dn, null_ref);
7208 else
7209 mdcache->send_dentry_unlink(dn, NULL, null_ref);
7210
7211 // bump target popularity
7212 mds->balancer->hit_inode(targeti, META_POP_IWR);
7213 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7214
7215 // reply
7216 respond_to_request(mdr, 0);
7217
7218 if (!inc)
7219 // removing a new dn?
7220 dn->get_dir()->try_remove_unlinked_dn(dn);
7221 }
7222
7223
7224 // remote linking/unlinking
7225
7226 class C_MDS_PeerLinkPrep : public ServerLogContext {
7227 CInode *targeti;
7228 bool adjust_realm;
7229 public:
7230 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
7231 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7232 void finish(int r) override {
7233 ceph_assert(r == 0);
7234 server->_logged_peer_link(mdr, targeti, adjust_realm);
7235 }
7236 };
7237
7238 class C_MDS_PeerLinkCommit : public ServerContext {
7239 MDRequestRef mdr;
7240 CInode *targeti;
7241 public:
7242 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7243 ServerContext(s), mdr(r), targeti(t) { }
7244 void finish(int r) override {
7245 server->_commit_peer_link(mdr, r, targeti);
7246 }
7247 };
7248
7249 void Server::handle_peer_link_prep(MDRequestRef& mdr)
7250 {
7251 dout(10) << "handle_peer_link_prep " << *mdr
7252 << " on " << mdr->peer_request->get_object_info()
7253 << dendl;
7254
7255 ceph_assert(g_conf()->mds_kill_link_at != 4);
7256
7257 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
7258 ceph_assert(targeti);
7259 dout(10) << "targeti " << *targeti << dendl;
7260 CDentry *dn = targeti->get_parent_dn();
7261 CDentry::linkage_t *dnl = dn->get_linkage();
7262 ceph_assert(dnl->is_primary());
7263
7264 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7265
7266 mdr->auth_pin(targeti);
7267
7268 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7269 ceph_assert(g_conf()->mds_kill_link_at != 5);
7270
7271 // journal it
7272 mdr->ls = mdlog->get_current_segment();
7273 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7274 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7275 mdlog->start_entry(le);
7276
7277 auto pi = dnl->get_inode()->project_inode(mdr);
7278
7279 // update journaled target inode
7280 bool inc;
7281 bool adjust_realm = false;
7282 bool realm_projected = false;
7283 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7284 inc = true;
7285 pi.inode->nlink++;
7286
7287 CDentry *target_pdn = targeti->get_projected_parent_dn();
7288 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7289 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7290 sr_t *newsnap = targeti->project_snaprealm();
7291 targeti->mark_snaprealm_global(newsnap);
7292 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
7293 adjust_realm = true;
7294 realm_projected = true;
7295 }
7296 } else {
7297 inc = false;
7298 pi.inode->nlink--;
7299 if (targeti->is_projected_snaprealm_global()) {
7300 ceph_assert(mdr->peer_request->desti_snapbl.length());
7301 auto p = mdr->peer_request->desti_snapbl.cbegin();
7302
7303 sr_t *newsnap = targeti->project_snaprealm();
7304 decode(*newsnap, p);
7305
7306 if (pi.inode->nlink == 0)
7307 ceph_assert(!newsnap->is_parent_global());
7308
7309 realm_projected = true;
7310 } else {
7311 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
7312 }
7313 }
7314
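      // capture enough pre-op state (ctime, parent dir mtime/rctime, snaprealm
      // blob) to undo the nlink change if the leader aborts the operation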
7315 link_rollback rollback;
7316 rollback.reqid = mdr->reqid;
7317 rollback.ino = targeti->ino();
7318 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7319 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7320 rollback.old_dir_mtime = pf->fragstat.mtime;
7321 rollback.old_dir_rctime = pf->rstat.rctime;
7322 rollback.was_inc = inc;
7323 if (realm_projected) {
7324 if (targeti->snaprealm) {
7325 encode(true, rollback.snapbl);
7326 targeti->encode_snap_blob(rollback.snapbl);
7327 } else {
7328 encode(false, rollback.snapbl);
7329 }
7330 }
7331 encode(rollback, le->rollback);
7332 mdr->more()->rollback_bl = le->rollback;
7333
7334 pi.inode->ctime = mdr->get_op_stamp();
7335 pi.inode->version = targeti->pre_dirty();
7336
7337 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7338
7339 // commit case
7340 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7341 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
7342 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7343
7344 // set up commit waiter
7345 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7346
7347 mdr->more()->peer_update_journaled = true;
7348 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7349 mdr, __func__);
7350 mdlog->flush();
7351 }
7352
7353 void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7354 {
7355 dout(10) << "_logged_peer_link " << *mdr
7356 << " " << *targeti << dendl;
7357
7358 ceph_assert(g_conf()->mds_kill_link_at != 6);
7359
7360 // update the target
7361 mdr->apply();
7362
7363 // hit pop
7364 mds->balancer->hit_inode(targeti, META_POP_IWR);
7365
7366 // done.
7367 mdr->reset_peer_request();
7368
7369 if (adjust_realm) {
7370 int op = CEPH_SNAP_OP_SPLIT;
7371 mds->mdcache->send_snap_update(targeti, 0, op);
7372 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7373 }
7374
7375 // ack
7376 if (!mdr->aborted) {
7377 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7378 mds->send_message_mds(reply, mdr->peer_to_mds);
7379 } else {
7380 dout(10) << " abort flag set, finishing" << dendl;
7381 mdcache->request_finish(mdr);
7382 }
7383 }
7384
7385
7386 struct C_MDS_CommittedPeer : public ServerLogContext {
7387 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7388 void finish(int r) override {
7389 server->_committed_peer(mdr);
7390 }
7391 };
7392
7393 void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7394 {
7395 dout(10) << "_commit_peer_link " << *mdr
7396 << " r=" << r
7397 << " " << *targeti << dendl;
7398
7399 ceph_assert(g_conf()->mds_kill_link_at != 7);
7400
7401 if (r == 0) {
7402 // drop our pins, etc.
7403 mdr->cleanup();
7404
7405 // write a commit to the journal
7406 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7407 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7408 mdlog->start_entry(le);
7409 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7410 mdlog->flush();
7411 } else {
7412 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7413 }
7414 }
7415
7416 void Server::_committed_peer(MDRequestRef& mdr)
7417 {
7418 dout(10) << "_committed_peer " << *mdr << dendl;
7419
7420 ceph_assert(g_conf()->mds_kill_link_at != 8);
7421
7422 bool assert_exist = mdr->more()->peer_update_journaled;
7423 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7424 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7425 mds->send_message_mds(req, mdr->peer_to_mds);
7426 mdcache->request_finish(mdr);
7427 }
7428
7429 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7430 MutationRef mut;
7431 map<client_t,ref_t<MClientSnap>> splits;
7432 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
7433 map<client_t,ref_t<MClientSnap>>&& _splits) :
7434 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7435 }
7436 void finish(int r) override {
7437 server->_link_rollback_finish(mut, mdr, splits);
7438 }
7439 };
7440
7441 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7442 {
7443 link_rollback rollback;
7444 auto p = rbl.cbegin();
7445 decode(rollback, p);
7446
7447 dout(10) << "do_link_rollback on " << rollback.reqid
7448 << (rollback.was_inc ? " inc":" dec")
7449 << " ino " << rollback.ino
7450 << dendl;
7451
7452 ceph_assert(g_conf()->mds_kill_link_at != 9);
7453
7454 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7455 ceph_assert(mdr || mds->is_resolve());
7456
7457 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7458 mut->ls = mds->mdlog->get_current_segment();
7459
7460 CInode *in = mdcache->get_inode(rollback.ino);
7461 ceph_assert(in);
7462 dout(10) << " target is " << *in << dendl;
7463 ceph_assert(!in->is_projected()); // live peer request holds versionlock xlock.
7464
7465 auto pi = in->project_inode(mut);
7466 pi.inode->version = in->pre_dirty();
7467
7468 // parent dir rctime
7469 CDir *parent = in->get_projected_parent_dn()->get_dir();
7470 auto pf = parent->project_fnode(mut);
7471 pf->version = parent->pre_dirty();
7472 if (pf->fragstat.mtime == pi.inode->ctime) {
7473 pf->fragstat.mtime = rollback.old_dir_mtime;
7474 if (pf->rstat.rctime == pi.inode->ctime)
7475 pf->rstat.rctime = rollback.old_dir_rctime;
7476 mut->add_updated_lock(&parent->get_inode()->filelock);
7477 mut->add_updated_lock(&parent->get_inode()->nestlock);
7478 }
7479
7480 // inode
7481 pi.inode->ctime = rollback.old_ctime;
7482 if (rollback.was_inc)
7483 pi.inode->nlink--;
7484 else
7485 pi.inode->nlink++;
7486
7487 map<client_t,ref_t<MClientSnap>> splits;
7488 if (rollback.snapbl.length() && in->snaprealm) {
7489 bool hadrealm;
7490 auto p = rollback.snapbl.cbegin();
7491 decode(hadrealm, p);
7492 if (hadrealm) {
7493 if (!mds->is_resolve()) {
7494 sr_t *new_srnode = new sr_t();
7495 decode(*new_srnode, p);
7496 in->project_snaprealm(new_srnode);
7497 } else {
7498 decode(in->snaprealm->srnode, p);
7499 }
7500 } else {
7501 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7502 if (!mds->is_resolve())
7503 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7504 in->project_snaprealm(NULL);
7505 }
7506 }
7507
7508 // journal it
7509 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7510 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7511 mdlog->start_entry(le);
7512 le->commit.add_dir_context(parent);
7513 le->commit.add_dir(parent, true);
7514 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7515
7516 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7517 mdr, __func__);
7518 mdlog->flush();
7519 }
7520
7521 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
7522 map<client_t,ref_t<MClientSnap>>& splits)
7523 {
7524 dout(10) << "_link_rollback_finish" << dendl;
7525
7526 ceph_assert(g_conf()->mds_kill_link_at != 10);
7527
7528 mut->apply();
7529
7530 if (!mds->is_resolve())
7531 mdcache->send_snaps(splits);
7532
7533 if (mdr)
7534 mdcache->request_finish(mdr);
7535
7536 mdcache->finish_rollback(mut->reqid, mdr);
7537
7538 mut->cleanup();
7539 }
7540
7541
7542 void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7543 {
7544 dout(10) << "handle_peer_link_prep_ack " << *mdr
7545 << " " << *m << dendl;
7546 mds_rank_t from = mds_rank_t(m->get_source().num());
7547
7548 ceph_assert(g_conf()->mds_kill_link_at != 11);
7549
7550 // note peer
7551 mdr->more()->peers.insert(from);
7552
7553 // witnessed!
7554 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7555 mdr->more()->witnessed.insert(from);
7556 ceph_assert(!m->is_not_journaled());
7557 mdr->more()->has_journaled_peers = true;
7558
7559 // remove from waiting list
7560 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7561 mdr->more()->waiting_on_peer.erase(from);
7562
7563 ceph_assert(mdr->more()->waiting_on_peer.empty());
7564
7565 dispatch_client_request(mdr); // go again!
7566 }
7567
7568
7569
7570
7571
7572 // UNLINK
7573
7574 void Server::handle_client_unlink(MDRequestRef& mdr)
7575 {
7576 const cref_t<MClientRequest> &req = mdr->client_request;
7577 client_t client = mdr->get_client();
7578
7579 // rmdir or unlink?
7580 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7581
7582 if (rmdir)
7583 mdr->disable_lock_cache();
7584 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7585 if (!dn)
7586 return;
7587
7588 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
7589 ceph_assert(!dnl->is_null());
7590 CInode *in = dnl->get_inode();
7591
7592 if (rmdir) {
7593 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7594 } else {
7595 dout(7) << "handle_client_unlink on " << *dn << dendl;
7596 }
7597 dout(7) << "dn links to " << *in << dendl;
7598
7599 // rmdir vs is_dir
7600 if (in->is_dir()) {
7601 if (rmdir) {
7602 // do empty directory checks
7603 if (_dir_is_nonempty_unlocked(mdr, in)) {
7604 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7605 return;
7606 }
7607 } else {
7608 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
7609 respond_to_request(mdr, -CEPHFS_EISDIR);
7610 return;
7611 }
7612 } else {
7613 if (rmdir) {
7614 // unlink
7615 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
7616 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7617 return;
7618 }
7619 }
7620
7621 CInode *diri = dn->get_dir()->get_inode();
7622 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7623 if (!check_access(mdr, diri, MAY_WRITE))
7624 return;
7625 }
7626
7627 // -- create stray dentry? --
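      // when the primary link is removed the inode is not destroyed immediately;
      // it is reparented under a stray directory until its remaining remote links
      // and client references are gone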
7628 CDentry *straydn = NULL;
7629 if (dnl->is_primary()) {
7630 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7631 if (!straydn)
7632 return;
7633 dout(10) << " straydn is " << *straydn << dendl;
7634 } else if (mdr->straydn) {
7635 mdr->unpin(mdr->straydn);
7636 mdr->straydn = NULL;
7637 }
7638
7639 // lock
7640 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7641 MutationImpl::LockOpVec lov;
7642
7643 lov.add_xlock(&in->linklock);
7644 lov.add_xlock(&in->snaplock);
7645 if (in->is_dir())
7646 lov.add_rdlock(&in->filelock); // to verify it's empty
7647
7648 if (straydn) {
7649 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7650 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7651 lov.add_xlock(&straydn->lock);
7652 }
7653
7654 if (!mds->locker->acquire_locks(mdr, lov))
7655 return;
7656
7657 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7658 }
7659
7660 if (in->is_dir() &&
7661 _dir_is_nonempty(mdr, in)) {
7662 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7663 return;
7664 }
7665
7666 if (straydn)
7667 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7668
7669 if (!mdr->more()->desti_srnode) {
7670 if (in->is_projected_snaprealm_global()) {
7671 sr_t *new_srnode = in->prepare_new_srnode(0);
7672 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7673 // dropping the last linkage or dropping the last remote linkage,
7674 // detach the inode from the global snaprealm
7675 auto nlink = in->get_projected_inode()->nlink;
7676 if (nlink == 1 ||
7677 (nlink == 2 && !dnl->is_primary() &&
7678 !in->get_projected_parent_dir()->inode->is_stray()))
7679 in->clear_snaprealm_global(new_srnode);
7680 mdr->more()->desti_srnode = new_srnode;
7681 } else if (dnl->is_primary()) {
7682 // prepare snaprealm blob for peer request
7683 SnapRealm *realm = in->find_snaprealm();
7684 snapid_t follows = realm->get_newest_seq();
7685 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7686 sr_t *new_srnode = in->prepare_new_srnode(follows);
7687 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7688 mdr->more()->desti_srnode = new_srnode;
7689 }
7690 }
7691 }
7692
7693 // yay!
7694 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7695 // subtree root auths need to be witnesses
7696 set<mds_rank_t> witnesses;
7697 in->list_replicas(witnesses);
7698 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7699
7700 for (set<mds_rank_t>::iterator p = witnesses.begin();
7701 p != witnesses.end();
7702 ++p) {
7703 if (mdr->more()->witnessed.count(*p)) {
7704 dout(10) << " already witnessed by mds." << *p << dendl;
7705 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7706 dout(10) << " already waiting on witness mds." << *p << dendl;
7707 } else {
7708 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7709 return;
7710 }
7711 }
7712 if (!mdr->more()->waiting_on_peer.empty())
7713 return; // we're waiting for a witness.
7714 }
7715
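      // for a plain unlink of a primary dentry named by a single path component,
      // set up a lock cache on the parent dir so further unlinks there can reuse
      // these locks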
7716 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7717 mds->locker->create_lock_cache(mdr, diri);
7718
7719 // ok!
7720 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7721 _link_remote(mdr, false, dn, dnl->get_inode());
7722 else
7723 _unlink_local(mdr, dn, straydn);
7724 }
7725
7726 class C_MDS_unlink_local_finish : public ServerLogContext {
7727 CDentry *dn;
7728 CDentry *straydn;
7729 version_t dnpv; // deleted dentry
7730 public:
7731 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7732 ServerLogContext(s, r), dn(d), straydn(sd),
7733 dnpv(d->get_projected_version()) {}
7734 void finish(int r) override {
7735 ceph_assert(r == 0);
7736 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7737 }
7738 };
7739
7740 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7741 {
7742 dout(10) << "_unlink_local " << *dn << dendl;
7743
7744 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7745 CInode *in = dnl->get_inode();
7746
7747
7748 // ok, let's do it.
7749 mdr->ls = mdlog->get_current_segment();
7750
7751 // prepare log entry
7752 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7753 mdlog->start_entry(le);
7754 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7755 if (!mdr->more()->witnessed.empty()) {
7756 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7757 le->reqid = mdr->reqid;
7758 le->had_peers = true;
7759 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7760 }
7761
7762 if (straydn) {
7763 ceph_assert(dnl->is_primary());
7764 straydn->push_projected_linkage(in);
7765 }
7766
7767 // the unlinked dentry
7768 dn->pre_dirty();
7769
7770 auto pi = in->project_inode(mdr);
7771 {
7772 std::string t;
7773 dn->make_path_string(t, true);
7774 pi.inode->stray_prior_path = std::move(t);
7775 }
7776 pi.inode->version = in->pre_dirty();
7777 pi.inode->ctime = mdr->get_op_stamp();
7778 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7779 pi.inode->rstat.rctime = mdr->get_op_stamp();
7780 pi.inode->change_attr++;
7781 pi.inode->nlink--;
7782 if (pi.inode->nlink == 0)
7783 in->state_set(CInode::STATE_ORPHAN);
7784
7785 if (mdr->more()->desti_srnode) {
7786 auto& desti_srnode = mdr->more()->desti_srnode;
7787 in->project_snaprealm(desti_srnode);
7788 desti_srnode = NULL;
7789 }
7790
7791 if (straydn) {
7792 // will manually pop projected inode
7793
7794 // primary link. add stray dentry.
7795 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7796 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7797
7798 pi.inode->update_backtrace();
7799 le->metablob.add_primary_dentry(straydn, in, true, true);
7800 } else {
7801 // remote link. update remote inode.
7802 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7803 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7804 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7805 }
7806
7807 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7808 le->metablob.add_null_dentry(dn, true);
7809
7810 if (in->is_dir()) {
7811 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7812 le->metablob.renamed_dirino = in->ino();
7813 }
7814
7815 dn->push_projected_linkage();
7816
7817 if (straydn) {
7818 ceph_assert(in->first <= straydn->first);
7819 in->first = straydn->first;
7820 }
7821
7822 if (in->is_dir()) {
7823 ceph_assert(straydn);
7824 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7825 }
7826
7827 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7828 }
7829
7830 void Server::_unlink_local_finish(MDRequestRef& mdr,
7831 CDentry *dn, CDentry *straydn,
7832 version_t dnpv)
7833 {
7834 dout(10) << "_unlink_local_finish " << *dn << dendl;
7835
7836 if (!mdr->more()->witnessed.empty())
7837 mdcache->logged_leader_update(mdr->reqid);
7838
7839 CInode *strayin = NULL;
7840 bool hadrealm = false;
7841 if (straydn) {
7842 // if there is a newly created snaprealm, we need to split the old snaprealm's
7843 // inodes_with_caps. So pop the snaprealm before the linkage changes.
7844 strayin = dn->get_linkage()->get_inode();
7845 hadrealm = strayin->snaprealm ? true : false;
7846 strayin->early_pop_projected_snaprealm();
7847 }
7848
7849 // unlink main dentry
7850 dn->get_dir()->unlink_inode(dn);
7851 dn->pop_projected_linkage();
7852 dn->mark_dirty(dnpv, mdr->ls);
7853
7854 // relink as stray? (i.e. was primary link?)
7855 if (straydn) {
7856 dout(20) << " straydn is " << *straydn << dendl;
7857 straydn->pop_projected_linkage();
7858 mdcache->touch_dentry_bottom(straydn);
7859 }
7860
7861 mdr->apply();
7862
7863 mdcache->send_dentry_unlink(dn, straydn, mdr);
7864
7865 if (straydn) {
7866 // update subtree map?
7867 if (strayin->is_dir())
7868 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7869
7870 if (strayin->snaprealm && !hadrealm)
7871 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7872 }
7873
7874 // bump pop
7875 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7876
7877 // reply
7878 respond_to_request(mdr, 0);
7879
7880 // removing a new dn?
7881 dn->get_dir()->try_remove_unlinked_dn(dn);
7882
7883 // clean up ?
7884 // respond_to_request() drops locks. So stray reintegration can race with us.
7885 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7886 // Tip off the MDCache that this dentry is a stray that
7887 // might be eligible for purge.
7888 mdcache->notify_stray(straydn);
7889 }
7890 }
7891
7892 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7893 {
7894 if (mds->is_cluster_degraded() &&
7895 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7896 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
7897 if (mdr->more()->waiting_on_peer.empty())
7898 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7899 return false;
7900 }
7901
7902 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
7903 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
7904 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7905 for (auto dn : trace)
7906 req->srcdnpath.push_dentry(dn->get_name());
7907 mdcache->encode_replica_stray(straydn, who, req->straybl);
7908 if (mdr->more()->desti_srnode)
7909 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7910
7911 req->op_stamp = mdr->get_op_stamp();
7912 mds->send_message_mds(req, who);
7913
7914 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
7915 mdr->more()->waiting_on_peer.insert(who);
7916 return true;
7917 }
7918
7919 struct C_MDS_PeerRmdirPrep : public ServerLogContext {
7920 CDentry *dn, *straydn;
7921 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7922 : ServerLogContext(s, r), dn(d), straydn(st) {}
7923 void finish(int r) override {
7924 server->_logged_peer_rmdir(mdr, dn, straydn);
7925 }
7926 };
7927
7928 struct C_MDS_PeerRmdirCommit : public ServerContext {
7929 MDRequestRef mdr;
7930 CDentry *straydn;
7931 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
7932 : ServerContext(s), mdr(r), straydn(sd) { }
7933 void finish(int r) override {
7934 server->_commit_peer_rmdir(mdr, r, straydn);
7935 }
7936 };
7937
7938 void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
7939 {
7940 dout(10) << "handle_peer_rmdir_prep " << *mdr
7941 << " " << mdr->peer_request->srcdnpath
7942 << " to " << mdr->peer_request->destdnpath
7943 << dendl;
7944
7945 vector<CDentry*> trace;
7946 filepath srcpath(mdr->peer_request->srcdnpath);
7947 dout(10) << " src " << srcpath << dendl;
7948 CInode *in;
7949 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
7950 int r = mdcache->path_traverse(mdr, cf, srcpath,
7951 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7952 &trace, &in);
7953 if (r > 0) return;
7954 if (r == -CEPHFS_ESTALE) {
7955 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7956 mdr->peer_to_mds, true);
7957 return;
7958 }
7959 ceph_assert(r == 0);
7960 CDentry *dn = trace.back();
7961 dout(10) << " dn " << *dn << dendl;
7962 mdr->pin(dn);
7963
7964 ceph_assert(mdr->straydn);
7965 CDentry *straydn = mdr->straydn;
7966 dout(10) << " straydn " << *straydn << dendl;
7967
7968 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7969
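      // record where the dentry came from and where the stray dentry points so
      // this change can be rolled back if the leader aborts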
7970 rmdir_rollback rollback;
7971 rollback.reqid = mdr->reqid;
7972 rollback.src_dir = dn->get_dir()->dirfrag();
7973 rollback.src_dname = dn->get_name();
7974 rollback.dest_dir = straydn->get_dir()->dirfrag();
7975 rollback.dest_dname = straydn->get_name();
7976 if (mdr->peer_request->desti_snapbl.length()) {
7977 if (in->snaprealm) {
7978 encode(true, rollback.snapbl);
7979 in->encode_snap_blob(rollback.snapbl);
7980 } else {
7981 encode(false, rollback.snapbl);
7982 }
7983 }
7984 encode(rollback, mdr->more()->rollback_bl);
7985 // FIXME: rollback snaprealm
7986 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7987
7988 // set up commit waiter
7989 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
7990
7991 straydn->push_projected_linkage(in);
7992 dn->push_projected_linkage();
7993
7994 ceph_assert(straydn->first >= in->first);
7995 in->first = straydn->first;
7996
7997 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7998 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7999 _logged_peer_rmdir(mdr, dn, straydn);
8000 return;
8001 }
8002
8003 mdr->ls = mdlog->get_current_segment();
8004 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
8005 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
8006 mdlog->start_entry(le);
8007 le->rollback = mdr->more()->rollback_bl;
8008
8009 le->commit.add_dir_context(straydn->get_dir());
8010 le->commit.add_primary_dentry(straydn, in, true);
8011 // peer: no need to journal original dentry
8012
8013 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8014 le->commit.renamed_dirino = in->ino();
8015
8016 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8017 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
8018
8019 mdr->more()->peer_update_journaled = true;
8020 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
8021 mdr, __func__);
8022 mdlog->flush();
8023 }
8024
8025 void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8026 {
8027 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
8028 CInode *in = dn->get_linkage()->get_inode();
8029
8030 bool new_realm;
8031 if (mdr->peer_request->desti_snapbl.length()) {
8032 new_realm = !in->snaprealm;
8033 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
8034 ceph_assert(in->snaprealm);
8035 } else {
8036 new_realm = false;
8037 }
8038
8039 // update our cache now, so we are consistent with what is in the journal
8040 // when we journal a subtree map
8041 dn->get_dir()->unlink_inode(dn);
8042 straydn->pop_projected_linkage();
8043 dn->pop_projected_linkage();
8044
8045 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
8046
8047 if (new_realm)
8048 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
8049
8050 // done.
8051 mdr->reset_peer_request();
8052 mdr->straydn = 0;
8053
8054 if (!mdr->aborted) {
8055 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
8056 if (!mdr->more()->peer_update_journaled)
8057 reply->mark_not_journaled();
8058 mds->send_message_mds(reply, mdr->peer_to_mds);
8059 } else {
8060 dout(10) << " abort flag set, finishing" << dendl;
8061 mdcache->request_finish(mdr);
8062 }
8063 }
8064
8065 void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
8066 {
8067 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8068 << " " << *ack << dendl;
8069
8070 mds_rank_t from = mds_rank_t(ack->get_source().num());
8071
8072 mdr->more()->peers.insert(from);
8073 mdr->more()->witnessed.insert(from);
8074 if (!ack->is_not_journaled())
8075 mdr->more()->has_journaled_peers = true;
8076
8077 // remove from waiting list
8078 ceph_assert(mdr->more()->waiting_on_peer.count(from));
8079 mdr->more()->waiting_on_peer.erase(from);
8080
8081 if (mdr->more()->waiting_on_peer.empty())
8082 dispatch_client_request(mdr); // go again!
8083 else
8084 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
8085 }
8086
8087 void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
8088 {
8089 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
8090
8091 if (r == 0) {
8092 if (mdr->more()->peer_update_journaled) {
8093 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8094 if (strayin && !strayin->snaprealm)
8095 mdcache->clear_dirty_bits_for_stray(strayin);
8096 }
8097
8098 mdr->cleanup();
8099
8100 if (mdr->more()->peer_update_journaled) {
8101 // write a commit to the journal
8102 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
8103 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
8104 EPeerUpdate::RMDIR);
8105 mdlog->start_entry(le);
8106 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
8107 mdlog->flush();
8108 } else {
8109 _committed_peer(mdr);
8110 }
8111 } else {
8112 // abort
8113 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
8114 }
8115 }
8116
8117 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
8118 metareqid_t reqid;
8119 CDentry *dn;
8120 CDentry *straydn;
8121 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
8122 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
8123 void finish(int r) override {
8124 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
8125 }
8126 };
8127
8128 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
8129 {
8130 // unlike the other rollback methods, the rmdir rollback is only
8131 // needed to record the subtree changes in the journal for inode
8132 // replicas who are auth for empty dirfrags. no actual changes to
8133 // the file system are taking place here, so there is no Mutation.
8134
8135 rmdir_rollback rollback;
8136 auto p = rbl.cbegin();
8137 decode(rollback, p);
8138
8139 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
8140 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
8141 ceph_assert(mdr || mds->is_resolve());
8142
8143 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
8144 if (!dir)
8145 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
8146 ceph_assert(dir);
8147 CDentry *dn = dir->lookup(rollback.src_dname);
8148 ceph_assert(dn);
8149 dout(10) << " dn " << *dn << dendl;
8150 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
8151 ceph_assert(straydir);
8152 CDentry *straydn = straydir->lookup(rollback.dest_dname);
8153 ceph_assert(straydn);
8154 dout(10) << " straydn " << *straydn << dendl;
8155 CInode *in = straydn->get_linkage()->get_inode();
8156
8157 dn->push_projected_linkage(in);
8158 straydn->push_projected_linkage();
8159
8160 if (rollback.snapbl.length() && in->snaprealm) {
8161 bool hadrealm;
8162 auto p = rollback.snapbl.cbegin();
8163 decode(hadrealm, p);
8164 if (hadrealm) {
8165 decode(in->snaprealm->srnode, p);
8166 } else {
8167 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
8168 }
8169 }
8170
8171 if (mdr && !mdr->more()->peer_update_journaled) {
8172 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
8173
8174 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
8175 return;
8176 }
8177
8178
8179 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
8180 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
8181 mdlog->start_entry(le);
8182
8183 le->commit.add_dir_context(dn->get_dir());
8184 le->commit.add_primary_dentry(dn, in, true);
8185 // peer: no need to journal straydn
8186
8187 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8188 le->commit.renamed_dirino = in->ino();
8189
8190 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
8191
8192 submit_mdlog_entry(le,
8193 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
8194 dn, straydn),
8195 mdr, __func__);
8196 mdlog->flush();
8197 }
8198
8199 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
8200 {
8201 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
8202
8203 straydn->get_dir()->unlink_inode(straydn);
8204 dn->pop_projected_linkage();
8205 straydn->pop_projected_linkage();
8206
8207 CInode *in = dn->get_linkage()->get_inode();
8208 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
8209 !mdr || mdr->more()->peer_update_journaled);
8210
8211 if (mds->is_resolve()) {
8212 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
8213 mdcache->try_trim_non_auth_subtree(root);
8214 }
8215
8216 if (mdr)
8217 mdcache->request_finish(mdr);
8218
8219 mdcache->finish_rollback(reqid, mdr);
8220 }
8221
8222
8223 /** _dir_is_nonempty[_unlocked]
8224 *
8225 * check whether a directory is non-empty (if it is, we cannot rmdir it).
8226 *
8227 * the unlocked variant is a fastpath check; we can't really be
8228 * sure until we rdlock the filelock.
8229 */
8230 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
8231 {
8232 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
8233 ceph_assert(in->is_auth());
8234
8235 if (in->filelock.is_cached())
8236 return false; // there can be pending async create/unlink. don't know.
8237 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
8238 return true; // in a snapshot!
8239
8240 auto&& ls = in->get_dirfrags();
8241 for (const auto& dir : ls) {
8242 // is the frag obviously non-empty?
8243 if (dir->is_auth()) {
8244 if (dir->get_projected_fnode()->fragstat.size()) {
8245 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8246 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
8247 return true;
8248 }
8249 }
8250 }
8251
8252 return false;
8253 }
8254
8255 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
8256 {
8257 dout(10) << "dir_is_nonempty " << *in << dendl;
8258 ceph_assert(in->is_auth());
8259 ceph_assert(in->filelock.can_read(mdr->get_client()));
8260
8261 frag_info_t dirstat;
8262 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
8263
8264 auto&& ls = in->get_dirfrags();
8265 for (const auto& dir : ls) {
8266 const auto& pf = dir->get_projected_fnode();
8267 if (pf->fragstat.size()) {
8268 dout(10) << "dir_is_nonempty dirstat has "
8269 << pf->fragstat.size() << " items " << *dir << dendl;
8270 return true;
8271 }
8272
8273 if (pf->accounted_fragstat.version == dirstat_version)
8274 dirstat.add(pf->accounted_fragstat);
8275 else
8276 dirstat.add(pf->fragstat);
8277 }
8278
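  // the fragstats above looked empty; double-check that they add up to the
  // inode's dirstat, and treat any mismatch as non-empty.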
8279 return dirstat.size() != in->get_projected_inode()->dirstat.size();
8280 }
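// (typical use, as in handle_client_rename below: the unlocked variant is a
//  cheap check before any locks are held, and _dir_is_nonempty() repeats the
//  check once the target's filelock is rdlocked, when the answer is
//  authoritative.)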
8281
8282
8283 // ======================================================
8284
8285
8286 class C_MDS_rename_finish : public ServerLogContext {
8287 CDentry *srcdn;
8288 CDentry *destdn;
8289 CDentry *straydn;
8290 public:
8291 C_MDS_rename_finish(Server *s, MDRequestRef& r,
8292 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
8293 ServerLogContext(s, r),
8294 srcdn(sdn), destdn(ddn), straydn(stdn) { }
8295 void finish(int r) override {
8296 ceph_assert(r == 0);
8297 server->_rename_finish(mdr, srcdn, destdn, straydn);
8298 }
8299 };
8300
8301
8302 /** handle_client_rename
8303 *
8304 * rename leader is the destdn auth. this is because cached inodes
8305 * must remain connected. thus, any replica of srci must also
8306 * replicate destdn, and possibly straydn, so that srci (and
8307 * destdn->inode) remain connected during the rename.
8308 *
8309 * to do this, we freeze srci, then leader (destdn auth) verifies that
8310 * all other nodes have also replicated destdn and straydn. note that
8311 * destdn replicas need not also replicate srci. this only works when
8312 * destdn is leader.
8313 *
8314 * This function takes responsibility for the passed mdr.
8315 */
8316 void Server::handle_client_rename(MDRequestRef& mdr)
8317 {
8318 const auto& req = mdr->client_request;
8319 dout(7) << "handle_client_rename " << *req << dendl;
8320
8321 filepath destpath = req->get_filepath();
8322 filepath srcpath = req->get_filepath2();
8323 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
8324 respond_to_request(mdr, -CEPHFS_EBUSY);
8325 return;
8326 }
8327
8328 if (req->get_alternate_name().size() > alternate_name_max) {
8329 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8330 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
8331 return;
8332 }
8333
8334 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8335 if (!destdn)
8336 return;
8337
8338 dout(10) << " destdn " << *destdn << dendl;
8339 CDir *destdir = destdn->get_dir();
8340 ceph_assert(destdir->is_auth());
8341 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8342
8343 dout(10) << " srcdn " << *srcdn << dendl;
8344 CDir *srcdir = srcdn->get_dir();
8345 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8346 CInode *srci = srcdnl->get_inode();
8347 dout(10) << " srci " << *srci << dendl;
8348
8349 // -- some sanity checks --
8350 if (destdn == srcdn) {
8351 dout(7) << "rename src=dest, noop" << dendl;
8352 respond_to_request(mdr, 0);
8353 return;
8354 }
8355
8356 // dest a child of src?
8357 // e.g. mv /usr /usr/foo
8358 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8359 dout(7) << "cannot rename item to be a child of itself" << dendl;
8360 respond_to_request(mdr, -CEPHFS_EINVAL);
8361 return;
8362 }
8363
8364 // is this a stray migration, reintegration or merge? (sanity checks!)
8365 if (mdr->reqid.name.is_mds() &&
8366 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8367 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8368 !(destdnl->is_remote() &&
8369 destdnl->get_remote_ino() == srci->ino())) {
8370 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
8371 return;
8372 }
8373
8374 CInode *oldin = 0;
8375 if (!destdnl->is_null()) {
8376 //dout(10) << "dest dn exists " << *destdn << dendl;
8377 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8378 if (!oldin) return;
8379 dout(10) << " oldin " << *oldin << dendl;
8380
8381 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8382 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
8383 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8384 return;
8385 }
8386
8387 // mv /some/thing /to/some/existing_other_thing
8388 if (oldin->is_dir() && !srci->is_dir()) {
8389 respond_to_request(mdr, -CEPHFS_EISDIR);
8390 return;
8391 }
8392 if (!oldin->is_dir() && srci->is_dir()) {
8393 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8394 return;
8395 }
8396 if (srci == oldin && !srcdir->inode->is_stray()) {
8397 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8398 return;
8399 }
8400 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8401 /* the dentry exists but the alternate_names do not match, fail... */
8402 respond_to_request(mdr, -CEPHFS_EINVAL);
8403 return;
8404 }
8405 }
8406
8407 vector<CDentry*>& srctrace = mdr->dn[1];
8408 vector<CDentry*>& desttrace = mdr->dn[0];
8409
8410 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8411 if (destpath.get_ino() != srcpath.get_ino() &&
8412 !(req->get_source().is_mds() &&
8413 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8414 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8415 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8416 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8417 while (srcbase != destbase &&
8418 !srcbase->is_projected_ancestor_of(destbase)) {
8419 CDentry *pdn = srcbase->get_projected_parent_dn();
8420 srctrace.insert(srctrace.begin(), pdn);
8421 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8422 srcbase = pdn->get_dir()->get_inode();
8423 }
8424
8425 // then, extend destpath until it shares the same parent inode as srcpath.
8426 while (destbase != srcbase) {
8427 CDentry *pdn = destbase->get_projected_parent_dn();
8428 desttrace.insert(desttrace.begin(), pdn);
8429 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8430 destbase = pdn->get_dir()->get_inode();
8431 }
8432 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8433 }
8434
8435
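  // 'linkmerge': the source and destination dentries already refer to the
  // same inode.  as asserted further down, this only happens when
  // reintegrating a stray: the primary dentry (in a stray dir) is merged onto
  // an existing remote link.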
8436 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8437 if (linkmerge)
8438 dout(10) << " this is a link merge" << dendl;
8439
8440 // -- create stray dentry? --
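  // (if the rename overwrites a primary-linked target, the overwritten inode
  //  is relinked under this stray dentry; a stray dentry pinned by an earlier
  //  pass of this request but no longer needed is dropped instead.)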
8441 CDentry *straydn = NULL;
8442 if (destdnl->is_primary() && !linkmerge) {
8443 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8444 if (!straydn)
8445 return;
8446 dout(10) << " straydn is " << *straydn << dendl;
8447 } else if (mdr->straydn) {
8448 mdr->unpin(mdr->straydn);
8449 mdr->straydn = NULL;
8450 }
8451
8452
8453 // -- locks --
8454 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8455 MutationImpl::LockOpVec lov;
8456
8457 // we need to update srci's ctime. xlock its least contended lock to do that...
8458 lov.add_xlock(&srci->linklock);
8459 lov.add_xlock(&srci->snaplock);
8460
8461 if (oldin) {
8462 // xlock oldin (for nlink--)
8463 lov.add_xlock(&oldin->linklock);
8464 lov.add_xlock(&oldin->snaplock);
8465 if (oldin->is_dir()) {
8466 ceph_assert(srci->is_dir());
8467 lov.add_rdlock(&oldin->filelock); // to verify it's empty
8468
8469 // adjust locking order?
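        // (presumably so that two concurrent renames involving the same pair
        // of inodes always take these xlocks in one canonical order and
        // cannot deadlock, whichever inode is the source and which the target)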
8470 int cmp = mdr->compare_paths();
8471 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8472 std::reverse(lov.begin(), lov.end());
8473 } else {
8474 ceph_assert(!srci->is_dir());
8475 // adjust locking order;
8476 if (srci->ino() > oldin->ino())
8477 std::reverse(lov.begin(), lov.end());
8478 }
8479 }
8480
8481 // straydn?
8482 if (straydn) {
8483 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8484 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8485 lov.add_xlock(&straydn->lock);
8486 }
8487
8488 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8489 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8490 return;
8491
8492 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8493 }
8494
8495 if (linkmerge)
8496 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8497
8498 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
8499 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8500 return;
8501
8502 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8503 return;
8504
8505 if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
8506 return;
8507
8508 if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
8509 return;
8510
8511 if (!check_access(mdr, srci, MAY_WRITE))
8512 return;
8513 }
8514
8515 // with read lock, really verify oldin is empty
8516 if (oldin &&
8517 oldin->is_dir() &&
8518 _dir_is_nonempty(mdr, oldin)) {
8519 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8520 return;
8521 }
8522
8523 /* project_snaprealm_past_parent() will do this job
8524 *
8525 // moving between snaprealms?
8526 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8527 SnapRealm *srcrealm = srci->find_snaprealm();
8528 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8529 if (srcrealm != destrealm &&
8530 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8531 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8532 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8533 mdcache->snaprealm_create(mdr, srci);
8534 return;
8535 }
8536 }
8537 */
8538
8539 SnapRealm *dest_realm = nullptr;
8540 SnapRealm *src_realm = nullptr;
8541 if (!linkmerge) {
8542 dest_realm = destdir->inode->find_snaprealm();
8543 if (srcdir->inode == destdir->inode)
8544 src_realm = dest_realm;
8545 else
8546 src_realm = srcdir->inode->find_snaprealm();
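    // renames may not cross a subvolume boundary; the client gets EXDEV,
    // as it would for a cross-device rename.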
8547 if (src_realm != dest_realm &&
8548 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
8549 respond_to_request(mdr, -CEPHFS_EXDEV);
8550 return;
8551 }
8552 }
8553
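  // mds_kill_rename_at is a failure-injection config (normally 0); setting it
  // to N makes the MDS abort at checkpoint N of the rename, for testing
  // recovery and rollback.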
8554 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8555
8556 // -- open all srcdn inode frags, if any --
8557 // we need these open so that auth can properly delegate from inode to dirfrags
8558 // after the inode is _ours_.
8559 if (srcdnl->is_primary() &&
8560 !srcdn->is_auth() &&
8561 srci->is_dir()) {
8562 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8563 mdr->set_stickydirs(srci);
8564
8565 frag_vec_t leaves;
8566 srci->dirfragtree.get_leaves(leaves);
8567 for (const auto& leaf : leaves) {
8568 CDir *dir = srci->get_dirfrag(leaf);
8569 if (!dir) {
8570 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8571 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8572 return;
8573 }
8574 }
8575 }
8576
8577 // -- prepare snaprealm ---
8578
8579 if (linkmerge) {
8580 if (!mdr->more()->srci_srnode &&
8581 srci->get_projected_inode()->nlink == 1 &&
8582 srci->is_projected_snaprealm_global()) {
8583 sr_t *new_srnode = srci->prepare_new_srnode(0);
8584 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8585
8586 srci->clear_snaprealm_global(new_srnode);
8587 mdr->more()->srci_srnode = new_srnode;
8588 }
8589 } else {
8590 if (oldin && !mdr->more()->desti_srnode) {
8591 if (oldin->is_projected_snaprealm_global()) {
8592 sr_t *new_srnode = oldin->prepare_new_srnode(0);
8593 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8594 // when dropping the last linkage, or the last remote linkage,
8595 // detach the inode from the global snaprealm
8596 auto nlink = oldin->get_projected_inode()->nlink;
8597 if (nlink == 1 ||
8598 (nlink == 2 && !destdnl->is_primary() &&
8599 !oldin->get_projected_parent_dir()->inode->is_stray()))
8600 oldin->clear_snaprealm_global(new_srnode);
8601 mdr->more()->desti_srnode = new_srnode;
8602 } else if (destdnl->is_primary()) {
8603 snapid_t follows = dest_realm->get_newest_seq();
8604 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8605 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8606 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8607 mdr->more()->desti_srnode = new_srnode;
8608 }
8609 }
8610 }
8611 if (!mdr->more()->srci_srnode) {
8612 if (srci->is_projected_snaprealm_global()) {
8613 sr_t *new_srnode = srci->prepare_new_srnode(0);
8614 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8615 mdr->more()->srci_srnode = new_srnode;
8616 } else if (srcdnl->is_primary()) {
8617 snapid_t follows = src_realm->get_newest_seq();
8618 if (src_realm != dest_realm &&
8619 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8620 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8621 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8622 mdr->more()->srci_srnode = new_srnode;
8623 }
8624 }
8625 }
8626 }
8627
8628 // -- prepare witnesses --
8629
8630 /*
8631 * NOTE: we use _all_ replicas as witnesses.
8632 * this probably isn't totally necessary (esp for file renames),
8633 * but if/when we change that, we have to make sure rejoin is
8634 * sufficiently robust to handle strong rejoins from survivors
8635 * with totally wrong dentry->inode linkage.
8636 * (currently, it can ignore rename effects, because the resolve
8637 * stage will sort them out.)
8638 */
8639 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8640 if (srcdn->is_auth())
8641 srcdn->list_replicas(witnesses);
8642 else
8643 witnesses.insert(srcdn->authority().first);
8644 if (srcdnl->is_remote() && !srci->is_auth())
8645 witnesses.insert(srci->authority().first);
8646 destdn->list_replicas(witnesses);
8647 if (destdnl->is_remote() && !oldin->is_auth())
8648 witnesses.insert(oldin->authority().first);
8649 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8650
8651 if (!witnesses.empty()) {
8652 // Replicas can't see projected dentry linkages and will get confused.
8653 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8654 // can't project these inodes' linkages.
8655 bool need_flush = false;
8656 for (auto& dn : srctrace) {
8657 if (dn->is_projected()) {
8658 need_flush = true;
8659 break;
8660 }
8661 }
8662 if (!need_flush) {
8663 CDentry *dn = destdn;
8664 do {
8665 if (dn->is_projected()) {
8666 need_flush = true;
8667 break;
8668 }
8669 CInode *diri = dn->get_dir()->get_inode();
8670 dn = diri->get_projected_parent_dn();
8671 } while (dn);
8672 }
8673 if (need_flush) {
8674 mdlog->wait_for_safe(
8675 new MDSInternalContextWrapper(mds,
8676 new C_MDS_RetryRequest(mdcache, mdr)));
8677 mdlog->flush();
8678 return;
8679 }
8680 }
8681
8682 // do srcdn auth last
8683 mds_rank_t last = MDS_RANK_NONE;
8684 if (!srcdn->is_auth()) {
8685 last = srcdn->authority().first;
8686 mdr->more()->srcdn_auth_mds = last;
8687 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8688 // are involved in the rename operation.
8689 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8690 dout(10) << " preparing ambiguous auth for srci" << dendl;
8691 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8692 ceph_assert(mdr->more()->rename_inode == srci);
8693 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8694 return;
8695 }
8696 }
8697
8698 for (set<mds_rank_t>::iterator p = witnesses.begin();
8699 p != witnesses.end();
8700 ++p) {
8701 if (*p == last) continue; // do it last!
8702 if (mdr->more()->witnessed.count(*p)) {
8703 dout(10) << " already witnessed by mds." << *p << dendl;
8704 } else if (mdr->more()->waiting_on_peer.count(*p)) {
8705 dout(10) << " already waiting on witness mds." << *p << dendl;
8706 } else {
8707 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8708 return;
8709 }
8710 }
8711 if (!mdr->more()->waiting_on_peer.empty())
8712 return; // we're waiting for a witness.
8713
8714 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8715 dout(10) << " preparing last witness (srcdn auth)" << dendl;
8716 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
8717 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8718 return;
8719 }
8720
8721 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8722 if (!mdr->more()->peers.empty() && !srci->is_dir())
8723 ceph_assert(g_conf()->mds_kill_rename_at != 3);
8724 if (!mdr->more()->peers.empty() && srci->is_dir())
8725 ceph_assert(g_conf()->mds_kill_rename_at != 4);
8726
8727 // -- declare now --
8728 mdr->set_mds_stamp(ceph_clock_now());
8729
8730 // -- prepare journal entry --
8731 mdr->ls = mdlog->get_current_segment();
8732 EUpdate *le = new EUpdate(mdlog, "rename");
8733 mdlog->start_entry(le);
8734 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
8735 if (!mdr->more()->witnessed.empty()) {
8736 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
8737
8738 le->reqid = mdr->reqid;
8739 le->had_peers = true;
8740
8741 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8742 // no need to send frozen auth pin to recovering auth MDS of srci
8743 mdr->more()->is_remote_frozen_authpin = false;
8744 }
8745
8746 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
8747 if (le->client_map.length())
8748 le->cmapv = mds->sessionmap.get_projected();
8749
8750 // -- commit locally --
8751 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8752
8753 journal_and_reply(mdr, srci, destdn, le, fin);
8754 mds->balancer->maybe_fragment(destdn->get_dir(), false);
8755 }
8756
8757
8758 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8759 {
8760 dout(10) << "_rename_finish " << *mdr << dendl;
8761
8762 if (!mdr->more()->witnessed.empty())
8763 mdcache->logged_leader_update(mdr->reqid);
8764
8765 // apply
8766 _rename_apply(mdr, srcdn, destdn, straydn);
8767
8768 mdcache->send_dentry_link(destdn, mdr);
8769
8770 CDentry::linkage_t *destdnl = destdn->get_linkage();
8771 CInode *in = destdnl->get_inode();
8772 bool need_eval = mdr->more()->cap_imports.count(in);
8773
8774 // test hack: test peer commit
8775 if (!mdr->more()->peers.empty() && !in->is_dir())
8776 ceph_assert(g_conf()->mds_kill_rename_at != 5);
8777 if (!mdr->more()->peers.empty() && in->is_dir())
8778 ceph_assert(g_conf()->mds_kill_rename_at != 6);
8779
8780 // bump popularity
8781 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8782 if (destdnl->is_remote() && in->is_auth())
8783 mds->balancer->hit_inode(in, META_POP_IWR);
8784
8785 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8786
8787 ceph_assert(g_conf()->mds_kill_rename_at != 7);
8788
8789 // reply
8790 respond_to_request(mdr, 0);
8791
8792 if (need_eval)
8793 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8794
8795 // clean up?
8796 // respond_to_request() drops locks. So stray reintegration can race with us.
8797 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8798 mdcache->notify_stray(straydn);
8799 }
8800 }
8801
8802
8803
8804 // helpers
8805
8806 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
8807 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8808 {
8809 const auto& client_req = mdr->client_request;
8810 ceph_assert(client_req);
8811
8812 if (mds->is_cluster_degraded() &&
8813 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8814 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8815 if (mdr->more()->waiting_on_peer.empty())
8816 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8817 return false;
8818 }
8819
8820 dout(10) << "_rename_prepare_witness mds." << who << dendl;
8821 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
8822
8823 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8824 for (auto dn : srctrace)
8825 req->srcdnpath.push_dentry(dn->get_name());
8826 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8827 for (auto dn : dsttrace)
8828 req->destdnpath.push_dentry(dn->get_name());
8829 req->alternate_name = client_req->alternate_name;
8830 if (straydn)
8831 mdcache->encode_replica_stray(straydn, who, req->straybl);
8832
8833 if (mdr->more()->srci_srnode)
8834 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8835 if (mdr->more()->desti_srnode)
8836 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8837
8838 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
8839
8840 // srcdn auth will verify our current witness list is sufficient
8841 req->witnesses = witnesses;
8842
8843 req->op_stamp = mdr->get_op_stamp();
8844 mds->send_message_mds(req, who);
8845
8846 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8847 mdr->more()->waiting_on_peer.insert(who);
8848 return true;
8849 }
8850
8851 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8852 {
8853 version_t oldpv = mdr->more()->inode_import_v;
8854
8855 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8856
8857 /* import node */
8858 auto blp = mdr->more()->inode_import.cbegin();
8859
8860 // imported caps
8861 map<client_t,entity_inst_t> client_map;
8862 map<client_t, client_metadata_t> client_metadata_map;
8863 decode(client_map, blp);
8864 decode(client_metadata_map, blp);
8865 prepare_force_open_sessions(client_map, client_metadata_map,
8866 mdr->more()->imported_session_map);
8867 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
8868 encode(client_metadata_map, *client_map_bl);
8869
8870 list<ScatterLock*> updated_scatterlocks;
8871 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8872 mdr->more()->cap_imports, updated_scatterlocks);
8873
8874 // hack: force back to !auth and clean, temporarily
8875 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8876 srcdnl->get_inode()->mark_clean();
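  // (the auth bit is restored by the 'fix auth bit' hack in _rename_apply,
  //  once the import has actually been applied on this MDS.)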
8877
8878 return oldpv;
8879 }
8880
8881 bool Server::_need_force_journal(CInode *diri, bool empty)
8882 {
8883 auto&& dirs = diri->get_dirfrags();
8884
8885 bool force_journal = false;
8886 if (empty) {
8887 for (const auto& dir : dirs) {
8888 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8889 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
8890 force_journal = true;
8891 break;
8892 } else
8893 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
8894 }
8895 } else {
8896 // see if any children of our frags are auth subtrees.
8897 std::vector<CDir*> subtrees;
8898 mdcache->get_subtrees(subtrees);
8899 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8900 for (const auto& dir : dirs) {
8901 for (const auto& subtree : subtrees) {
8902 if (dir->contains(subtree)) {
8903 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8904 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8905 << *subtree << dendl;
8906 force_journal = true;
8907 break;
8908 } else
8909 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
8910 } else
8911 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
8912 }
8913 if (force_journal)
8914 break;
8915 }
8916 }
8917 return force_journal;
8918 }
8919
8920 void Server::_rename_prepare(MDRequestRef& mdr,
8921 EMetaBlob *metablob, bufferlist *client_map_bl,
8922 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
8923 CDentry *straydn)
8924 {
8925 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8926 if (straydn)
8927 dout(10) << " straydn " << *straydn << dendl;
8928
8929 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8930 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8931 CInode *srci = srcdnl->get_inode();
8932 CInode *oldin = destdnl->get_inode();
8933
8934 // primary+remote link merge?
8935 bool linkmerge = (srci == oldin);
8936 if (linkmerge)
8937 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8938 bool silent = srcdn->get_dir()->inode->is_stray();
8939
8940 bool force_journal_dest = false;
8941 if (srci->is_dir() && !destdn->is_auth()) {
8942 if (srci->is_auth()) {
8943 // if we are auth for srci and exporting it, force journal because journal replay needs
8944 // the source inode to create auth subtrees.
8945 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8946 force_journal_dest = true;
8947 } else
8948 force_journal_dest = _need_force_journal(srci, false);
8949 }
8950
8951 bool force_journal_stray = false;
8952 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8953 force_journal_stray = _need_force_journal(oldin, true);
8954
8955 if (linkmerge)
8956 dout(10) << " merging remote and primary links to the same inode" << dendl;
8957 if (silent)
8958 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8959 if (force_journal_dest)
8960 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8961 if (force_journal_stray)
8962 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8963
8964 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8965 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8966 metablob->renamed_dirino = srci->ino();
8967 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8968 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8969 metablob->renamed_dirino = oldin->ino();
8970 }
8971
8972 // prepare
8973 CInode::mempool_inode *spi = 0; // renamed inode
8974 CInode::mempool_inode *tpi = 0; // target/overwritten inode
8975
8976 // target inode
8977 if (!linkmerge) {
8978 if (destdnl->is_primary()) {
8979 ceph_assert(straydn); // moving to straydn.
8980 // link--, and move.
8981 if (destdn->is_auth()) {
8982 auto pi = oldin->project_inode(mdr); // project_snaprealm
8983 pi.inode->version = straydn->pre_dirty(pi.inode->version);
8984 pi.inode->update_backtrace();
8985 tpi = pi.inode.get();
8986 }
8987 straydn->push_projected_linkage(oldin);
8988 } else if (destdnl->is_remote()) {
8989 // nlink-- targeti
8990 if (oldin->is_auth()) {
8991 auto pi = oldin->project_inode(mdr);
8992 pi.inode->version = oldin->pre_dirty();
8993 tpi = pi.inode.get();
8994 }
8995 }
8996 }
8997
8998 // dest
8999 if (destdnl->is_null()) {
9000 /* handle_client_rename checks that alternate_name matches for existing destdn */
9001 destdn->set_alternate_name(alternate_name);
9002 }
9003 if (srcdnl->is_remote()) {
9004 if (!linkmerge) {
9005 // destdn
9006 if (destdn->is_auth())
9007 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
9008 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9009 // srci
9010 if (srci->is_auth()) {
9011 auto pi = srci->project_inode(mdr);
9012 pi.inode->version = srci->pre_dirty();
9013 spi = pi.inode.get();
9014 }
9015 } else {
9016 dout(10) << " will merge remote onto primary link" << dendl;
9017 if (destdn->is_auth()) {
9018 auto pi = oldin->project_inode(mdr);
9019 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
9020 spi = pi.inode.get();
9021 }
9022 }
9023 } else { // primary
9024 if (destdn->is_auth()) {
9025 version_t oldpv;
9026 if (srcdn->is_auth())
9027 oldpv = srci->get_projected_version();
9028 else {
9029 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
9030
9031 // note which dirfrags have child subtrees in the journal
9032 // event, so that we can open those (as bounds) during replay.
9033 if (srci->is_dir()) {
9034 auto&& ls = srci->get_dirfrags();
9035 for (const auto& dir : ls) {
9036 if (!dir->is_auth())
9037 metablob->renamed_dir_frags.push_back(dir->get_frag());
9038 }
9039 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
9040 }
9041 }
9042 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
9043 // & srcdnl->snaprealm
9044 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
9045 pi.inode->update_backtrace();
9046 spi = pi.inode.get();
9047 }
9048 destdn->push_projected_linkage(srci);
9049 }
9050
9051 // src
9052 if (srcdn->is_auth())
9053 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
9054 srcdn->push_projected_linkage(); // push null linkage
9055
9056 if (!silent) {
9057 if (spi) {
9058 spi->ctime = mdr->get_op_stamp();
9059 if (mdr->get_op_stamp() > spi->rstat.rctime)
9060 spi->rstat.rctime = mdr->get_op_stamp();
9061 spi->change_attr++;
9062 if (linkmerge)
9063 spi->nlink--;
9064 }
9065 if (tpi) {
9066 tpi->ctime = mdr->get_op_stamp();
9067 if (mdr->get_op_stamp() > tpi->rstat.rctime)
9068 tpi->rstat.rctime = mdr->get_op_stamp();
9069 tpi->change_attr++;
9070 {
9071 std::string t;
9072 destdn->make_path_string(t, true);
9073 tpi->stray_prior_path = std::move(t);
9074 }
9075 tpi->nlink--;
9076 if (tpi->nlink == 0)
9077 oldin->state_set(CInode::STATE_ORPHAN);
9078 }
9079 }
9080
9081 // prepare nesting, mtime updates
9082 int predirty_dir = silent ? 0:PREDIRTY_DIR;
9083
9084 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9085 // then link the source inode to destdn
9086 if (destdnl->is_primary()) {
9087 ceph_assert(straydn);
9088 if (straydn->is_auth()) {
9089 metablob->add_dir_context(straydn->get_dir());
9090 metablob->add_dir(straydn->get_dir(), true);
9091 }
9092 }
9093
9094 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
9095 CDir *oldin_dir = oldin->get_projected_parent_dir();
9096 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
9097 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
9098 }
9099
9100 // sub off target
9101 if (destdn->is_auth() && !destdnl->is_null()) {
9102 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
9103 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
9104 if (destdnl->is_primary()) {
9105 ceph_assert(straydn);
9106 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
9107 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
9108 }
9109 }
9110
9111 if (srcdnl->is_remote() && srci->is_auth()) {
9112 CDir *srci_dir = srci->get_projected_parent_dir();
9113 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
9114 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
9115 }
9116
9117 // move srcdn
9118 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
9119 int flags = predirty_dir | predirty_primary;
9120 if (srcdn->is_auth())
9121 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
9122 if (destdn->is_auth())
9123 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
9124
9125 // add it all to the metablob
9126 // target inode
9127 if (!linkmerge) {
9128 if (destdnl->is_primary()) {
9129 ceph_assert(straydn);
9130 if (destdn->is_auth()) {
9131 // project snaprealm, too
9132 if (auto& desti_srnode = mdr->more()->desti_srnode) {
9133 oldin->project_snaprealm(desti_srnode);
9134 if (tpi->nlink == 0)
9135 ceph_assert(!desti_srnode->is_parent_global());
9136 desti_srnode = NULL;
9137 }
9138 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9139 metablob->add_primary_dentry(straydn, oldin, true, true);
9140 } else if (force_journal_stray) {
9141 dout(10) << " forced journaling straydn " << *straydn << dendl;
9142 metablob->add_dir_context(straydn->get_dir());
9143 metablob->add_primary_dentry(straydn, oldin, true);
9144 }
9145 } else if (destdnl->is_remote()) {
9146 if (oldin->is_auth()) {
9147 sr_t *new_srnode = NULL;
9148 if (mdr->peer_request) {
9149 if (mdr->peer_request->desti_snapbl.length() > 0) {
9150 new_srnode = new sr_t();
9151 auto p = mdr->peer_request->desti_snapbl.cbegin();
9152 decode(*new_srnode, p);
9153 }
9154 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9155 new_srnode = desti_srnode;
9156 desti_srnode = NULL;
9157 }
9158 if (new_srnode) {
9159 oldin->project_snaprealm(new_srnode);
9160 if (tpi->nlink == 0)
9161 ceph_assert(!new_srnode->is_parent_global());
9162 }
9163 // auth for targeti
9164 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
9165 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
9166 metablob->add_primary_dentry(oldin_pdn, oldin, true);
9167 }
9168 }
9169 }
9170
9171 // dest
9172 if (srcdnl->is_remote()) {
9173 ceph_assert(!linkmerge);
9174 if (destdn->is_auth() && !destdnl->is_null())
9175 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9176 else
9177 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9178
9179 if (destdn->is_auth())
9180 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9181
9182 if (srci->is_auth()) { // it's remote
9183 if (mdr->peer_request) {
9184 if (mdr->peer_request->srci_snapbl.length() > 0) {
9185 sr_t *new_srnode = new sr_t();
9186 auto p = mdr->peer_request->srci_snapbl.cbegin();
9187 decode(*new_srnode, p);
9188 srci->project_snaprealm(new_srnode);
9189 }
9190 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9191 srci->project_snaprealm(srci_srnode);
9192 srci_srnode = NULL;
9193 }
9194
9195 CDentry *srci_pdn = srci->get_projected_parent_dn();
9196 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
9197 metablob->add_primary_dentry(srci_pdn, srci, true);
9198 }
9199 } else if (srcdnl->is_primary()) {
9200 // project snap parent update?
9201 if (destdn->is_auth()) {
9202 if (auto& srci_srnode = mdr->more()->srci_srnode) {
9203 srci->project_snaprealm(srci_srnode);
9204 srci_srnode = NULL;
9205 }
9206 }
9207
9208 if (destdn->is_auth() && !destdnl->is_null())
9209 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9210
9211 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9212
9213 if (destdn->is_auth())
9214 metablob->add_primary_dentry(destdn, srci, true, true);
9215 else if (force_journal_dest) {
9216 dout(10) << " forced journaling destdn " << *destdn << dendl;
9217 metablob->add_dir_context(destdn->get_dir());
9218 metablob->add_primary_dentry(destdn, srci, true);
9219 if (srcdn->is_auth() && srci->is_dir()) {
9220 // journal new subtrees root dirfrags
9221 auto&& ls = srci->get_dirfrags();
9222 for (const auto& dir : ls) {
9223 if (dir->is_auth())
9224 metablob->add_dir(dir, true);
9225 }
9226 }
9227 }
9228 }
9229
9230 // src
9231 if (srcdn->is_auth()) {
9232 dout(10) << " journaling srcdn " << *srcdn << dendl;
9233 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
9234 // also journal the inode in case we need to do a peer rename rollback. It is OK to add
9235 // both primary and null dentries, because during journal replay the null dentry is
9236 // processed after the primary dentry.
9237 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
9238 metablob->add_primary_dentry(srcdn, srci, true);
9239 metablob->add_null_dentry(srcdn, true);
9240 } else
9241 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
9242
9243 // make renamed inode first track the dn
9244 if (srcdnl->is_primary() && destdn->is_auth()) {
9245 ceph_assert(srci->first <= destdn->first);
9246 srci->first = destdn->first;
9247 }
9248 // make stray inode first track the straydn
9249 if (straydn && straydn->is_auth()) {
9250 ceph_assert(oldin->first <= straydn->first);
9251 oldin->first = straydn->first;
9252 }
9253
9254 if (oldin && oldin->is_dir()) {
9255 ceph_assert(straydn);
9256 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
9257 }
9258 if (srci->is_dir())
9259 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
9260
9261 }
9262
9263
9264 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9265 {
9266 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9267 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
9268
9269 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9270 CDentry::linkage_t *destdnl = destdn->get_linkage();
9271
9272 CInode *oldin = destdnl->get_inode();
9273
9274 // primary+remote link merge?
9275 bool linkmerge = (srcdnl->get_inode() == oldin);
9276 if (linkmerge)
9277 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
9278
9279 bool new_in_snaprealm = false;
9280 bool new_oldin_snaprealm = false;
9281
9282 // target inode
9283 if (!linkmerge) {
9284 if (destdnl->is_primary()) {
9285 ceph_assert(straydn);
9286 dout(10) << "straydn is " << *straydn << dendl;
9287
9288 // if there is newly created snaprealm, need to split old snaprealm's
9289 // inodes_with_caps. So pop snaprealm before linkage changes.
9290 if (destdn->is_auth()) {
9291 bool hadrealm = (oldin->snaprealm ? true : false);
9292 oldin->early_pop_projected_snaprealm();
9293 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
9294 } else {
9295 ceph_assert(mdr->peer_request);
9296 if (mdr->peer_request->desti_snapbl.length()) {
9297 new_oldin_snaprealm = !oldin->snaprealm;
9298 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9299 ceph_assert(oldin->snaprealm);
9300 }
9301 }
9302
9303 destdn->get_dir()->unlink_inode(destdn, false);
9304
9305 straydn->pop_projected_linkage();
9306 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9307 ceph_assert(!straydn->is_projected()); // no other projected
9308
9309 // nlink-- targeti
9310 if (destdn->is_auth())
9311 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9312
9313 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
9314 } else if (destdnl->is_remote()) {
9315 destdn->get_dir()->unlink_inode(destdn, false);
9316 if (oldin->is_auth()) {
9317 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9318 } else if (mdr->peer_request) {
9319 if (mdr->peer_request->desti_snapbl.length() > 0) {
9320 ceph_assert(oldin->snaprealm);
9321 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9322 }
9323 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9324 delete desti_srnode;
9325 desti_srnode = NULL;
9326 }
9327 }
9328 }
9329
9330 // unlink src before we relink it at dest
9331 CInode *in = srcdnl->get_inode();
9332 ceph_assert(in);
9333
9334 bool srcdn_was_remote = srcdnl->is_remote();
9335 if (!srcdn_was_remote) {
9336 // if there is newly created snaprealm, need to split old snaprealm's
9337 // inodes_with_caps. So pop snaprealm before linkage changes.
9338 if (destdn->is_auth()) {
9339 bool hadrealm = (in->snaprealm ? true : false);
9340 in->early_pop_projected_snaprealm();
9341 new_in_snaprealm = (in->snaprealm && !hadrealm);
9342 } else {
9343 ceph_assert(mdr->peer_request);
9344 if (mdr->peer_request->srci_snapbl.length()) {
9345 new_in_snaprealm = !in->snaprealm;
9346 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9347 ceph_assert(in->snaprealm);
9348 }
9349 }
9350 }
9351
9352 srcdn->get_dir()->unlink_inode(srcdn);
9353
9354 // dest
9355 if (srcdn_was_remote) {
9356 if (!linkmerge) {
9357 // destdn
9358 destdnl = destdn->pop_projected_linkage();
9359 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9360 ceph_assert(!destdn->is_projected()); // no other projected
9361
9362 destdn->link_remote(destdnl, in);
9363 if (destdn->is_auth())
9364 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9365 // in
9366 if (in->is_auth()) {
9367 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9368 } else if (mdr->peer_request) {
9369 if (mdr->peer_request->srci_snapbl.length() > 0) {
9370 ceph_assert(in->snaprealm);
9371 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9372 }
9373 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9374 delete srci_srnode;
9375 srci_srnode = NULL;
9376 }
9377 } else {
9378 dout(10) << "merging remote onto primary link" << dendl;
9379 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9380 }
9381 } else { // primary
9382 if (linkmerge) {
9383 dout(10) << "merging primary onto remote link" << dendl;
9384 destdn->get_dir()->unlink_inode(destdn, false);
9385 }
9386 destdnl = destdn->pop_projected_linkage();
9387 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9388 ceph_assert(!destdn->is_projected()); // no other projected
9389
9390 // srcdn inode import?
9391 if (!srcdn->is_auth() && destdn->is_auth()) {
9392 ceph_assert(mdr->more()->inode_import.length() > 0);
9393
9394 map<client_t,Capability::Import> imported_caps;
9395
9396 // finish cap imports
9397 finish_force_open_sessions(mdr->more()->imported_session_map);
9398 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9399 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
9400 mdr->more()->srcdn_auth_mds, true,
9401 mdr->more()->imported_session_map,
9402 mdr->more()->cap_imports[destdnl->get_inode()],
9403 imported_caps);
9404 }
9405
9406 mdr->more()->inode_import.clear();
9407 encode(imported_caps, mdr->more()->inode_import);
9408
9409 /* hack: add an auth pin for each xlock we hold. These were
9410 * remote xlocks previously but now they're local and
9411 * we're going to try and unpin when we xlock_finish. */
9412
9413 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9414 i != mdr->locks.end();
9415 ++i) {
9416 SimpleLock *lock = i->lock;
9417 if (lock->get_parent() != destdnl->get_inode())
9418 break;
9419 if (i->is_xlock() && !lock->is_locallock())
9420 mds->locker->xlock_import(lock);
9421 }
9422
9423 // hack: fix auth bit
9424 in->state_set(CInode::STATE_AUTH);
9425
9426 mdr->clear_ambiguous_auth();
9427 }
9428
9429 if (destdn->is_auth())
9430 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9431 }
9432
9433 // src
9434 if (srcdn->is_auth())
9435 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9436 srcdn->pop_projected_linkage();
9437 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9438 ceph_assert(!srcdn->is_projected()); // no other projected
9439
9440 // apply remaining projected inodes (nested)
9441 mdr->apply();
9442
9443 // update subtree map?
9444 if (destdnl->is_primary() && in->is_dir())
9445 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
9446
9447 if (straydn && oldin->is_dir())
9448 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9449
9450 if (new_oldin_snaprealm)
9451 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9452 if (new_in_snaprealm)
9453 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9454
9455 // removing a new dn?
9456 if (srcdn->is_auth())
9457 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9458 }
9459
9460
9461
9462 // ------------
9463 // PEER
9464
9465 class C_MDS_PeerRenamePrep : public ServerLogContext {
9466 CDentry *srcdn, *destdn, *straydn;
9467 public:
9468 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9469 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9470 void finish(int r) override {
9471 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
9472 }
9473 };
9474
9475 class C_MDS_PeerRenameCommit : public ServerContext {
9476 MDRequestRef mdr;
9477 CDentry *srcdn, *destdn, *straydn;
9478 public:
9479 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9480 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9481 void finish(int r) override {
9482 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
9483 }
9484 };
9485
9486 class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
9487 MDRequestRef mdr;
9488 public:
9489 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
9490 ServerContext(s), mdr(r) {}
9491 void finish(int r) override {
9492 server->_peer_rename_sessions_flushed(mdr);
9493 }
9494 };
9495
9496 void Server::handle_peer_rename_prep(MDRequestRef& mdr)
9497 {
9498 dout(10) << "handle_peer_rename_prep " << *mdr
9499 << " " << mdr->peer_request->srcdnpath
9500 << " to " << mdr->peer_request->destdnpath
9501 << dendl;
9502
9503 if (mdr->peer_request->is_interrupted()) {
9504 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9505 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9506 reply->mark_interrupted();
9507 mds->send_message_mds(reply, mdr->peer_to_mds);
9508 mdr->reset_peer_request();
9509 return;
9510 }
9511
9512 // discover destdn
9513 filepath destpath(mdr->peer_request->destdnpath);
9514 dout(10) << " dest " << destpath << dendl;
9515 vector<CDentry*> trace;
9516 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9517 int r = mdcache->path_traverse(mdr, cf, destpath,
9518 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9519 &trace);
9520 if (r > 0) return;
9521 if (r == -CEPHFS_ESTALE) {
9522 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9523 mdr->peer_to_mds, true);
9524 return;
9525 }
9526 ceph_assert(r == 0); // we shouldn't get an error here!
9527
9528 CDentry *destdn = trace.back();
9529 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9530 dout(10) << " destdn " << *destdn << dendl;
9531 mdr->pin(destdn);
9532
9533 // discover srcdn
9534 filepath srcpath(mdr->peer_request->srcdnpath);
9535 dout(10) << " src " << srcpath << dendl;
9536 CInode *srci = nullptr;
9537 r = mdcache->path_traverse(mdr, cf, srcpath,
9538 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9539 &trace, &srci);
9540 if (r > 0) return;
9541 ceph_assert(r == 0);
9542
9543 CDentry *srcdn = trace.back();
9544 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9545 dout(10) << " srcdn " << *srcdn << dendl;
9546 mdr->pin(srcdn);
9547 mdr->pin(srci);
9548
9549 // stray?
9550 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9551 if (linkmerge)
9552 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9553 CDentry *straydn = mdr->straydn;
9554 if (destdnl->is_primary() && !linkmerge)
9555 ceph_assert(straydn);
9556
9557 mdr->set_op_stamp(mdr->peer_request->op_stamp);
9558 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9559
9560 // set up commit waiter (early, to clean up any freezing etc we do)
9561 if (!mdr->more()->peer_commit)
9562 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
9563
9564 // am i srcdn auth?
9565 if (srcdn->is_auth()) {
9566 set<mds_rank_t> srcdnrep;
9567 srcdn->list_replicas(srcdnrep);
9568
9569 bool reply_witness = false;
9570 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9571 // freeze?
9572 // we need this to
9573 // - avoid conflicting lock state changes
9574 // - avoid concurrent updates to the inode
9575 // (this could also be accomplished with the versionlock)
9576 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9577 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9578 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9579
9580 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9581 if (srcdnl->get_inode()->is_frozen_auth_pin())
9582 mdr->unfreeze_auth_pin();
9583
9584 if (!frozen_inode) {
9585 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9586 return;
9587 }
9588
9589 /*
9590 * set ambiguous auth for srci
9591 * NOTE: we don't worry about ambiguous cache expire as we do
9592 * with subtree migrations because all peers will pin
9593 * srcdn->get_inode() for duration of this rename.
9594 */
9595 mdr->set_ambiguous_auth(srcdnl->get_inode());
9596
9597 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9598 // the leader will send another OP_RENAMEPREP peer request later.
9599 if (mdr->peer_request->witnesses.size() > 1) {
9600 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9601 reply_witness = true;
9602 }
9603
9604 // make sure bystanders have received all lock related messages
9605 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9606 if (*p == mdr->peer_to_mds ||
9607 (mds->is_cluster_degraded() &&
9608 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9609 continue;
9610 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
9611 mds->send_message_mds(notify, *p);
9612 mdr->more()->waiting_on_peer.insert(*p);
9613 }
9614
9615 // make sure clients have received all cap related messages
9616 set<client_t> export_client_set;
9617 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9618
9619 MDSGatherBuilder gather(g_ceph_context);
9620 flush_client_sessions(export_client_set, gather);
9621 if (gather.has_subs()) {
9622 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9623 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
9624 gather.activate();
9625 }
9626 }
9627
9628 // is witness list sufficient?
9629 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9630 if (*p == mdr->peer_to_mds ||
9631 mdr->peer_request->witnesses.count(*p)) continue;
9632 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9633 reply_witness = true;
9634 break;
9635 }
9636
9637 if (reply_witness) {
9638 ceph_assert(!srcdnrep.empty());
9639 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9640 reply->witnesses.swap(srcdnrep);
9641 mds->send_message_mds(reply, mdr->peer_to_mds);
9642 mdr->reset_peer_request();
9643 return;
9644 }
9645 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9646 if (!mdr->more()->waiting_on_peer.empty()) {
9647 dout(10) << " still waiting for rename notify acks from "
9648 << mdr->more()->waiting_on_peer << dendl;
9649 return;
9650 }
9651 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9652 // set ambiguous auth for srci on witnesses
9653 mdr->set_ambiguous_auth(srcdnl->get_inode());
9654 }
9655
9656 // encode everything we'd need to roll this back... basically, just the original state.
9657 rename_rollback rollback;
9658
9659 rollback.reqid = mdr->reqid;
9660
9661 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9662 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9663 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
9664 rollback.orig_src.dname = srcdn->get_name();
9665 if (srcdnl->is_primary())
9666 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9667 else {
9668 ceph_assert(srcdnl->is_remote());
9669 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9670 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9671 }
9672
9673 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9674 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9675 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
9676 rollback.orig_dest.dname = destdn->get_name();
9677 if (destdnl->is_primary())
9678 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9679 else if (destdnl->is_remote()) {
9680 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9681 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9682 }
9683
9684 if (straydn) {
9685 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9686 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9687 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
9688 rollback.stray.dname = straydn->get_name();
9689 }
9690 if (mdr->peer_request->desti_snapbl.length()) {
9691 CInode *oldin = destdnl->get_inode();
9692 if (oldin->snaprealm) {
9693 encode(true, rollback.desti_snapbl);
9694 oldin->encode_snap_blob(rollback.desti_snapbl);
9695 } else {
9696 encode(false, rollback.desti_snapbl);
9697 }
9698 }
9699 if (mdr->peer_request->srci_snapbl.length()) {
9700 if (srci->snaprealm) {
9701 encode(true, rollback.srci_snapbl);
9702 srci->encode_snap_blob(rollback.srci_snapbl);
9703 } else {
9704 encode(false, rollback.srci_snapbl);
9705 }
9706 }
9707 encode(rollback, mdr->more()->rollback_bl);
9708 // FIXME: rollback snaprealm
9709 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9710
9711 // journal.
9712 mdr->ls = mdlog->get_current_segment();
9713 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9714 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
9715 mdlog->start_entry(le);
9716 le->rollback = mdr->more()->rollback_bl;
9717
9718 bufferlist blah; // inode import data... obviously not used if we're the peer
9719 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
9720
9721 if (le->commit.empty()) {
9722 dout(10) << " empty metablob, skipping journal" << dendl;
9723 mdlog->cancel_entry(le);
9724 mdr->ls = NULL;
9725 _logged_peer_rename(mdr, srcdn, destdn, straydn);
9726 } else {
9727 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
9728 mdr->more()->peer_update_journaled = true;
9729 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
9730 mdr, __func__);
9731 mdlog->flush();
9732 }
9733 }
9734
9735 void Server::_logged_peer_rename(MDRequestRef& mdr,
9736 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9737 {
9738 dout(10) << "_logged_peer_rename " << *mdr << dendl;
9739
9740 // prepare ack
9741 ref_t<MMDSPeerRequest> reply;
9742 if (!mdr->aborted) {
9743 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9744 if (!mdr->more()->peer_update_journaled)
9745 reply->mark_not_journaled();
9746 }
9747
9748 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9749 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9750
9751 // export srci?
9752 if (srcdn->is_auth() && srcdnl->is_primary()) {
9753 // set export bounds for CInode::encode_export()
9754 if (reply) {
9755 std::vector<CDir*> bounds;
9756 if (srcdnl->get_inode()->is_dir()) {
9757 srcdnl->get_inode()->get_dirfrags(bounds);
9758 for (const auto& bound : bounds) {
9759 bound->state_set(CDir::STATE_EXPORTBOUND);
9760 }
9761 }
9762
9763 map<client_t,entity_inst_t> exported_client_map;
9764 map<client_t, client_metadata_t> exported_client_metadata_map;
9765 bufferlist inodebl;
9766 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9767 exported_client_map,
9768 exported_client_metadata_map);
9769
9770 for (const auto& bound : bounds) {
9771 bound->state_clear(CDir::STATE_EXPORTBOUND);
9772 }
9773
9774 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9775 encode(exported_client_metadata_map, reply->inode_export);
9776 reply->inode_export.claim_append(inodebl);
9777 reply->inode_export_v = srcdnl->get_inode()->get_version();
9778 }
9779
9780 // remove mdr auth pin
9781 mdr->auth_unpin(srcdnl->get_inode());
9782 mdr->more()->is_inode_exporter = true;
9783
9784 if (srcdnl->get_inode()->is_dirty())
9785 srcdnl->get_inode()->mark_clean();
9786
9787 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9788 }
9789
9790 // apply
9791 _rename_apply(mdr, srcdn, destdn, straydn);
9792
9793 CDentry::linkage_t *destdnl = destdn->get_linkage();
9794
9795 // bump popularity
9796 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9797 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
9798 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
9799
9800 // done.
9801 mdr->reset_peer_request();
9802 mdr->straydn = 0;
9803
9804 if (reply) {
9805 mds->send_message_mds(reply, mdr->peer_to_mds);
9806 } else {
9807 ceph_assert(mdr->aborted);
9808 dout(10) << " abort flag set, finishing" << dendl;
9809 mdcache->request_finish(mdr);
9810 }
9811 }
9812
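// _commit_peer_rename: invoked when the leader tells this peer to commit
// (r == 0) or abort (r < 0) the prepared rename.  On commit we finish any
// inode export (hand xlocks and caps over to the leader, unfreeze the inode),
// drop ambiguous-auth state, and journal an EPeerUpdate OP_COMMIT if the prep
// was journaled.  On abort we either roll back using the saved rollback blob
// or, if there is nothing to roll back, just unfreeze and finish the request.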
9813 void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
9814 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9815 {
9816 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
9817
9818 CInode *in = destdn->get_linkage()->get_inode();
9819
9820 inodeno_t migrated_stray;
9821 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9822 migrated_stray = in->ino();
9823
9824 MDSContext::vec finished;
9825 if (r == 0) {
9826 // unfreeze+singleauth inode
9827 // hmm, do i really need to delay this?
9828 if (mdr->more()->is_inode_exporter) {
9829 // drop our pins
9830 // we exported, clear out any xlocks that we moved to another MDS
9831
9832 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9833 i != mdr->locks.end(); ) {
9834 SimpleLock *lock = i->lock;
9835 if (lock->get_parent() != in)
9836 break;
9837 // we only care about xlocks on the exported inode
9838 if (i->is_xlock() && !lock->is_locallock())
9839 mds->locker->xlock_export(i++, mdr.get());
9840 else
9841 ++i;
9842 }
9843
9844 map<client_t,Capability::Import> peer_imported;
9845 auto bp = mdr->more()->inode_import.cbegin();
9846 decode(peer_imported, bp);
9847
9848 dout(10) << " finishing inode export on " << *in << dendl;
9849 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
9850 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9851
9852 // unfreeze
9853 ceph_assert(in->is_frozen_inode());
9854 in->unfreeze_inode(finished);
9855 }
9856
9857 // singleauth
9858 if (mdr->more()->is_ambiguous_auth) {
9859 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9860 mdr->more()->is_ambiguous_auth = false;
9861 }
9862
9863 if (straydn && mdr->more()->peer_update_journaled) {
9864 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9865 if (strayin && !strayin->snaprealm)
9866 mdcache->clear_dirty_bits_for_stray(strayin);
9867 }
9868
9869 mds->queue_waiters(finished);
9870 mdr->cleanup();
9871
9872 if (mdr->more()->peer_update_journaled) {
9873 // write a commit to the journal
9874 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
9875 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
9876 EPeerUpdate::RENAME);
9877 mdlog->start_entry(le);
9878 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
9879 mdlog->flush();
9880 } else {
9881 _committed_peer(mdr);
9882 }
9883 } else {
9884
9885 // abort
9886 // rollback_bl may be empty if we froze the inode but had to reply with an expanded
9887 // witness list to the leader, and the leader failed before asking us to prep again.
9888 if (mdr->more()->rollback_bl.length()) {
9889 if (mdr->more()->is_inode_exporter) {
9890 dout(10) << " reversing inode export of " << *in << dendl;
9891 in->abort_export();
9892 }
9893 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
9894 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
9895 // rollback but preserve the peer request
9896 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
9897 mdr->more()->rollback_bl.clear();
9898 } else
9899 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
9900 } else {
9901 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
9902 // singleauth
9903 if (mdr->more()->is_ambiguous_auth) {
9904 if (srcdn->is_auth())
9905 mdr->more()->rename_inode->unfreeze_inode(finished);
9906
9907 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9908 mdr->more()->is_ambiguous_auth = false;
9909 }
9910 mds->queue_waiters(finished);
9911 mdcache->request_finish(mdr);
9912 }
9913 }
9914
9915 if (migrated_stray && mds->is_stopping())
9916 mdcache->shutdown_export_stray_finish(migrated_stray);
9917 }
9918
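// Helper for rename rollback: re-add the dentry's contribution to the parent
// dirfrag's fragstat/rstat and restore the old mtime/rctime, but only if they
// still carry the rename's ctime (i.e. nothing newer has touched them since).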
9919 static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
9920 rename_rollback::drec &r, utime_t ctime,
9921 bool isdir, const nest_info_t &rstat)
9922 {
9923 auto pf = dir->project_fnode(mut);
9924 pf->version = dir->pre_dirty();
9925
9926 if (isdir) {
9927 pf->fragstat.nsubdirs += 1;
9928 } else {
9929 pf->fragstat.nfiles += 1;
9930 }
9931 if (r.ino) {
9932 pf->rstat.rbytes += rstat.rbytes;
9933 pf->rstat.rfiles += rstat.rfiles;
9934 pf->rstat.rsubdirs += rstat.rsubdirs;
9935 pf->rstat.rsnaps += rstat.rsnaps;
9936 }
9937 if (pf->fragstat.mtime == ctime) {
9938 pf->fragstat.mtime = r.dirfrag_old_mtime;
9939 if (pf->rstat.rctime == ctime)
9940 pf->rstat.rctime = r.dirfrag_old_rctime;
9941 }
9942 mut->add_updated_lock(&dir->get_inode()->filelock);
9943 mut->add_updated_lock(&dir->get_inode()->nestlock);
9944 }
9945
9946 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9947 MutationRef mut;
9948 CDentry *srcdn;
9949 version_t srcdnpv;
9950 CDentry *destdn;
9951 CDentry *straydn;
9952 map<client_t,ref_t<MClientSnap>> splits[2];
9953 bool finish_mdr;
9954 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
9955 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9956 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
9957 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
9958 straydn(st), finish_mdr(f) {
9959 splits[0].swap(_splits[0]);
9960 splits[1].swap(_splits[1]);
9961 }
9962 void finish(int r) override {
9963 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
9964 destdn, straydn, splits, finish_mdr);
9965 }
9966 };
9967
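// do_rename_rollback: undo a prepared (but uncommitted) peer rename using the
// rollback blob encoded at prep time.  Roughly: decode the rename_rollback
// record, re-locate the src/dest/stray dentries and the inodes involved,
// re-project the original linkages, ctimes and snaprealm blobs, repair the
// source dirfrag's stats, then journal an EPeerUpdate OP_ROLLBACK (or skip the
// journal entirely if the prep was never journaled) and finish up in
// _rename_rollback_finish.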
9968 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
9969 bool finish_mdr)
9970 {
9971 rename_rollback rollback;
9972 auto p = rbl.cbegin();
9973 decode(rollback, p);
9974
9975 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9976 // need to finish this update before sending resolve to claim the subtree
9977 mdcache->add_rollback(rollback.reqid, leader);
9978
9979 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9980 mut->ls = mds->mdlog->get_current_segment();
9981
9982 CDentry *srcdn = NULL;
9983 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9984 if (!srcdir)
9985 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9986 if (srcdir) {
9987 dout(10) << " srcdir " << *srcdir << dendl;
9988 srcdn = srcdir->lookup(rollback.orig_src.dname);
9989 if (srcdn) {
9990 dout(10) << " srcdn " << *srcdn << dendl;
9991 ceph_assert(srcdn->get_linkage()->is_null());
9992 } else
9993 dout(10) << " srcdn not found" << dendl;
9994 } else
9995 dout(10) << " srcdir not found" << dendl;
9996
9997 CDentry *destdn = NULL;
9998 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9999 if (!destdir)
10000 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
10001 if (destdir) {
10002 dout(10) << " destdir " << *destdir << dendl;
10003 destdn = destdir->lookup(rollback.orig_dest.dname);
10004 if (destdn)
10005 dout(10) << " destdn " << *destdn << dendl;
10006 else
10007 dout(10) << " destdn not found" << dendl;
10008 } else
10009 dout(10) << " destdir not found" << dendl;
10010
10011 CInode *in = NULL;
10012 if (rollback.orig_src.ino) {
10013 in = mdcache->get_inode(rollback.orig_src.ino);
10014 if (in && in->is_dir())
10015 ceph_assert(srcdn && destdn);
10016 } else
10017 in = mdcache->get_inode(rollback.orig_src.remote_ino);
10018
10019 CDir *straydir = NULL;
10020 CDentry *straydn = NULL;
10021 if (rollback.stray.dirfrag.ino) {
10022 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
10023 if (straydir) {
10024 dout(10) << " straydir " << *straydir << dendl;
10025 straydn = straydir->lookup(rollback.stray.dname);
10026 if (straydn) {
10027 dout(10) << " straydn " << *straydn << dendl;
10028 ceph_assert(straydn->get_linkage()->is_primary());
10029 } else
10030 dout(10) << " straydn not found" << dendl;
10031 } else
10032 dout(10) << " straydir not found" << dendl;
10033 }
10034
10035 CInode *target = NULL;
10036 if (rollback.orig_dest.ino) {
10037 target = mdcache->get_inode(rollback.orig_dest.ino);
10038 if (target)
10039 ceph_assert(destdn && straydn);
10040 } else if (rollback.orig_dest.remote_ino)
10041 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
10042
10043 // can't use is_auth() in the resolve stage
10044 mds_rank_t whoami = mds->get_nodeid();
10045 // peer
10046 ceph_assert(!destdn || destdn->authority().first != whoami);
10047 ceph_assert(!straydn || straydn->authority().first != whoami);
10048
10049 bool force_journal_src = false;
10050 bool force_journal_dest = false;
10051 if (in && in->is_dir() && srcdn->authority().first != whoami)
10052 force_journal_src = _need_force_journal(in, false);
10053 if (in && target && target->is_dir())
10054 force_journal_dest = _need_force_journal(in, true);
10055
10056 version_t srcdnpv = 0;
10057 // repair src
10058 if (srcdn) {
10059 if (srcdn->authority().first == whoami)
10060 srcdnpv = srcdn->pre_dirty();
10061 if (rollback.orig_src.ino) {
10062 ceph_assert(in);
10063 srcdn->push_projected_linkage(in);
10064 } else
10065 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
10066 rollback.orig_src.remote_d_type);
10067 }
10068
10069 map<client_t,ref_t<MClientSnap>> splits[2];
10070
10071 const CInode::mempool_inode *pip = nullptr;
10072 if (in) {
10073 bool projected;
10074 CDir *pdir = in->get_projected_parent_dir();
10075 if (pdir->authority().first == whoami) {
10076 auto pi = in->project_inode(mut);
10077 pi.inode->version = in->pre_dirty();
10078 if (pdir != srcdir) {
10079 auto pf = pdir->project_fnode(mut);
10080 pf->version = pdir->pre_dirty();
10081 }
10082 if (pi.inode->ctime == rollback.ctime)
10083 pi.inode->ctime = rollback.orig_src.old_ctime;
10084 projected = true;
10085 } else {
10086 if (in->get_inode()->ctime == rollback.ctime) {
10087 auto _inode = CInode::allocate_inode(*in->get_inode());
10088 _inode->ctime = rollback.orig_src.old_ctime;
10089 in->reset_inode(_inode);
10090 }
10091 projected = false;
10092 }
10093 pip = in->get_projected_inode().get();
10094
10095 if (rollback.srci_snapbl.length() && in->snaprealm) {
10096 bool hadrealm;
10097 auto p = rollback.srci_snapbl.cbegin();
10098 decode(hadrealm, p);
10099 if (hadrealm) {
10100 if (projected && !mds->is_resolve()) {
10101 sr_t *new_srnode = new sr_t();
10102 decode(*new_srnode, p);
10103 in->project_snaprealm(new_srnode);
10104 } else
10105 decode(in->snaprealm->srnode, p);
10106 } else {
10107 SnapRealm *realm;
10108 if (rollback.orig_src.ino) {
10109 ceph_assert(srcdir);
10110 realm = srcdir->get_inode()->find_snaprealm();
10111 } else {
10112 realm = in->snaprealm->parent;
10113 }
10114 if (!mds->is_resolve())
10115 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
10116 if (projected)
10117 in->project_snaprealm(NULL);
10118 else
10119 in->snaprealm->merge_to(realm);
10120 }
10121 }
10122 }
10123
10124 // repair dest
10125 if (destdn) {
10126 if (rollback.orig_dest.ino && target) {
10127 destdn->push_projected_linkage(target);
10128 } else if (rollback.orig_dest.remote_ino) {
10129 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
10130 rollback.orig_dest.remote_d_type);
10131 } else {
10132 // the dentry will be trimmed soon, it's ok to have wrong linkage
10133 if (rollback.orig_dest.ino)
10134 ceph_assert(mds->is_resolve());
10135 destdn->push_projected_linkage();
10136 }
10137 }
10138
10139 if (straydn)
10140 straydn->push_projected_linkage();
10141
10142 if (target) {
10143 bool projected;
10144 CInode::inode_ptr ti;
10145 CDir *pdir = target->get_projected_parent_dir();
10146 if (pdir->authority().first == whoami) {
10147 auto pi = target->project_inode(mut);
10148 pi.inode->version = target->pre_dirty();
10149 if (pdir != srcdir) {
10150 auto pf = pdir->project_fnode(mut);
10151 pf->version = pdir->pre_dirty();
10152 }
10153 ti = pi.inode;
10154 projected = true;
10155 } else {
10156 ti = CInode::allocate_inode(*target->get_inode());
10157 projected = false;
10158 }
10159
10160 if (ti->ctime == rollback.ctime)
10161 ti->ctime = rollback.orig_dest.old_ctime;
10162 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
10163 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
10164 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
10165 else
10166 ceph_assert(rollback.orig_dest.remote_ino &&
10167 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
10168 } else
10169 ti->nlink++;
10170
10171 if (!projected)
10172 target->reset_inode(ti);
10173
10174 if (rollback.desti_snapbl.length() && target->snaprealm) {
10175 bool hadrealm;
10176 auto p = rollback.desti_snapbl.cbegin();
10177 decode(hadrealm, p);
10178 if (hadrealm) {
10179 if (projected && !mds->is_resolve()) {
10180 sr_t *new_srnode = new sr_t();
10181 decode(*new_srnode, p);
10182 target->project_snaprealm(new_srnode);
10183 } else
10184 decode(target->snaprealm->srnode, p);
10185 } else {
10186 SnapRealm *realm;
10187 if (rollback.orig_dest.ino) {
10188 ceph_assert(destdir);
10189 realm = destdir->get_inode()->find_snaprealm();
10190 } else {
10191 realm = target->snaprealm->parent;
10192 }
10193 if (!mds->is_resolve())
10194 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
10195 if (projected)
10196 target->project_snaprealm(NULL);
10197 else
10198 target->snaprealm->merge_to(realm);
10199 }
10200 }
10201 }
10202
10203 if (srcdn && srcdn->authority().first == whoami) {
10204 nest_info_t blah;
10205 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
10206 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
10207 }
10208
10209 if (srcdn)
10210 dout(0) << " srcdn back to " << *srcdn << dendl;
10211 if (in)
10212 dout(0) << " srci back to " << *in << dendl;
10213 if (destdn)
10214 dout(0) << " destdn back to " << *destdn << dendl;
10215 if (target)
10216 dout(0) << " desti back to " << *target << dendl;
10217
10218 // journal it
10219 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
10220 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
10221 mdlog->start_entry(le);
10222
10223 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
10224 le->commit.add_dir_context(srcdir);
10225 if (rollback.orig_src.ino)
10226 le->commit.add_primary_dentry(srcdn, 0, true);
10227 else
10228 le->commit.add_remote_dentry(srcdn, true);
10229 }
10230
10231 if (!rollback.orig_src.ino && // remote linkage
10232 in && in->authority().first == whoami) {
10233 le->commit.add_dir_context(in->get_projected_parent_dir());
10234 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
10235 }
10236
10237 if (force_journal_dest) {
10238 ceph_assert(rollback.orig_dest.ino);
10239 le->commit.add_dir_context(destdir);
10240 le->commit.add_primary_dentry(destdn, 0, true);
10241 }
10242
10243 // peer: no need to journal straydn
10244
10245 if (target && target != in && target->authority().first == whoami) {
10246 ceph_assert(rollback.orig_dest.remote_ino);
10247 le->commit.add_dir_context(target->get_projected_parent_dir());
10248 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
10249 }
10250
10251 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
10252 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
10253 le->commit.renamed_dirino = in->ino();
10254 if (srcdn->authority().first == whoami) {
10255 auto&& ls = in->get_dirfrags();
10256 for (const auto& dir : ls) {
10257 if (!dir->is_auth())
10258 le->commit.renamed_dir_frags.push_back(dir->get_frag());
10259 }
10260 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
10261 }
10262 } else if (force_journal_dest) {
10263 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
10264 le->commit.renamed_dirino = target->ino();
10265 }
10266
10267 if (target && target->is_dir()) {
10268 ceph_assert(destdn);
10269 mdcache->project_subtree_rename(target, straydir, destdir);
10270 }
10271
10272 if (in && in->is_dir()) {
10273 ceph_assert(srcdn);
10274 mdcache->project_subtree_rename(in, destdir, srcdir);
10275 }
10276
10277 if (mdr && !mdr->more()->peer_update_journaled) {
10278 ceph_assert(le->commit.empty());
10279 mdlog->cancel_entry(le);
10280 mut->ls = NULL;
10281 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
10282 } else {
10283 ceph_assert(!le->commit.empty());
10284 if (mdr)
10285 mdr->more()->peer_update_journaled = false;
10286 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
10287 srcdn, srcdnpv, destdn, straydn,
10288 splits, finish_mdr);
10289 submit_mdlog_entry(le, fin, mdr, __func__);
10290 mdlog->flush();
10291 }
10292 }
10293
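// _rename_rollback_finish: pop the projected linkages installed by
// do_rename_rollback, apply the mutation, re-adjust the subtree map for any
// renamed directories, and either trim now-bogus non-auth subtrees (during
// resolve) or send the queued MClientSnap notifications.  Finally clear any
// ambiguous-auth state and finish or unblock the peer request.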
10294 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
10295 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
10296 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
10297 {
10298 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
10299
10300 if (straydn) {
10301 straydn->get_dir()->unlink_inode(straydn);
10302 straydn->pop_projected_linkage();
10303 }
10304 if (destdn) {
10305 destdn->get_dir()->unlink_inode(destdn);
10306 destdn->pop_projected_linkage();
10307 }
10308 if (srcdn) {
10309 srcdn->pop_projected_linkage();
10310 if (srcdn->authority().first == mds->get_nodeid()) {
10311 srcdn->mark_dirty(srcdnpv, mut->ls);
10312 if (srcdn->get_linkage()->is_primary())
10313 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
10314 }
10315 }
10316
10317 mut->apply();
10318
10319 if (srcdn && srcdn->get_linkage()->is_primary()) {
10320 CInode *in = srcdn->get_linkage()->get_inode();
10321 if (in && in->is_dir()) {
10322 ceph_assert(destdn);
10323 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10324 }
10325 }
10326
10327 if (destdn) {
10328 CInode *oldin = destdn->get_linkage()->get_inode();
10329 // update subtree map?
10330 if (oldin && oldin->is_dir()) {
10331 ceph_assert(straydn);
10332 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10333 }
10334 }
10335
10336 if (mds->is_resolve()) {
10337 CDir *root = NULL;
10338 if (straydn)
10339 root = mdcache->get_subtree_root(straydn->get_dir());
10340 else if (destdn)
10341 root = mdcache->get_subtree_root(destdn->get_dir());
10342 if (root)
10343 mdcache->try_trim_non_auth_subtree(root);
10344 } else {
10345 mdcache->send_snaps(splits[1]);
10346 mdcache->send_snaps(splits[0]);
10347 }
10348
10349 if (mdr) {
10350 MDSContext::vec finished;
10351 if (mdr->more()->is_ambiguous_auth) {
10352 if (srcdn->is_auth())
10353 mdr->more()->rename_inode->unfreeze_inode(finished);
10354
10355 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10356 mdr->more()->is_ambiguous_auth = false;
10357 }
10358 mds->queue_waiters(finished);
10359 if (finish_mdr || mdr->aborted)
10360 mdcache->request_finish(mdr);
10361 else
10362 mdr->more()->peer_rolling_back = false;
10363 }
10364
10365 mdcache->finish_rollback(mut->reqid, mdr);
10366
10367 mut->cleanup();
10368 }
10369
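// handle_peer_rename_prep_ack: leader-side handling of a peer's
// OP_RENAMEPREPACK.  An empty witness list means the peer has prepared (and
// possibly journaled) the rename; a non-empty list means the peer wants the
// prep retried with additional witnesses (srcdn replicas).  Any srci export
// blob is stashed for later import, and once every peer has answered the
// client request is dispatched again.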
10370 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10371 {
10372 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10373 << " witnessed by " << ack->get_source()
10374 << " " << *ack << dendl;
10375 mds_rank_t from = mds_rank_t(ack->get_source().num());
10376
10377 // note peer
10378 mdr->more()->peers.insert(from);
10379 if (mdr->more()->srcdn_auth_mds == from &&
10380 mdr->more()->is_remote_frozen_authpin &&
10381 !mdr->more()->is_ambiguous_auth) {
10382 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10383 }
10384
10385 // witnessed? or add extra witnesses?
10386 ceph_assert(mdr->more()->witnessed.count(from) == 0);
10387 if (ack->is_interrupted()) {
10388 dout(10) << " peer request interrupted, noop" << dendl;
10389 } else if (ack->witnesses.empty()) {
10390 mdr->more()->witnessed.insert(from);
10391 if (!ack->is_not_journaled())
10392 mdr->more()->has_journaled_peers = true;
10393 } else {
10394 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
10395 mdr->more()->extra_witnesses = ack->witnesses;
10396 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10397 }
10398
10399 // srci import?
10400 if (ack->inode_export.length()) {
10401 dout(10) << " got srci import" << dendl;
10402 mdr->more()->inode_import.share(ack->inode_export);
10403 mdr->more()->inode_import_v = ack->inode_export_v;
10404 }
10405
10406 // remove from waiting list
10407 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10408 mdr->more()->waiting_on_peer.erase(from);
10409
10410 if (mdr->more()->waiting_on_peer.empty())
10411 dispatch_client_request(mdr); // go again!
10412 else
10413 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10414 }
10415
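// handle_peer_rename_notify_ack: a replica has acked the rename notify; once
// all acks are in, the parked peer request (if any) is dispatched again.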
10416 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10417 {
10418 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
10419 << ack->get_source() << dendl;
10420 ceph_assert(mdr->is_peer());
10421 mds_rank_t from = mds_rank_t(ack->get_source().num());
10422
10423 if (mdr->more()->waiting_on_peer.count(from)) {
10424 mdr->more()->waiting_on_peer.erase(from);
10425
10426 if (mdr->more()->waiting_on_peer.empty()) {
10427 if (mdr->peer_request)
10428 dispatch_peer_request(mdr);
10429 } else
10430 dout(10) << " still waiting for rename notify acks from "
10431 << mdr->more()->waiting_on_peer << dendl;
10432 }
10433 }
10434
10435 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
10436 {
10437 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
10438
10439 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10440 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
10441
10442 if (mdr->more()->waiting_on_peer.empty()) {
10443 if (mdr->peer_request)
10444 dispatch_peer_request(mdr);
10445 } else
10446 dout(10) << " still waiting for rename notify acks from "
10447 << mdr->more()->waiting_on_peer << dendl;
10448 }
10449 }
10450
10451 // snaps
10452 /* This function takes responsibility for the passed mdr*/
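// Builds a readdir-like reply listing the snapshots of the directory's
// snaprealm: an (empty) DirStat, then one entry per snapshot consisting of the
// snap name, an infinite lease and an inodestat of the directory at that
// snapid, bounded by max_entries/max_bytes and resumable via path2 holding the
// last-seen snap name.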
10453 void Server::handle_client_lssnap(MDRequestRef& mdr)
10454 {
10455 const cref_t<MClientRequest> &req = mdr->client_request;
10456
10457 // traverse to path
10458 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10459 if (!diri)
10460 return;
10461
10462 if (!diri->is_dir()) {
10463 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10464 return;
10465 }
10466 dout(10) << "lssnap on " << *diri << dendl;
10467
10468 // lock snap
10469 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10470 return;
10471
10472 if (!check_access(mdr, diri, MAY_READ))
10473 return;
10474
10475 SnapRealm *realm = diri->find_snaprealm();
10476 map<snapid_t,const SnapInfo*> infomap;
10477 realm->get_snap_info(infomap, diri->get_oldest_snap());
10478
10479 unsigned max_entries = req->head.args.readdir.max_entries;
10480 if (!max_entries)
10481 max_entries = infomap.size();
10482 int max_bytes = req->head.args.readdir.max_bytes;
10483 if (!max_bytes)
10484 // make sure at least one item can be encoded
10485 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
10486
10487 __u64 last_snapid = 0;
10488 string offset_str = req->get_path2();
10489 if (!offset_str.empty())
10490 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10491
10492 //Empty DirStat
10493 bufferlist dirbl;
10494 static DirStat empty;
10495 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10496
10497 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10498
10499 __u32 num = 0;
10500 bufferlist dnbl;
10501 auto p = infomap.upper_bound(last_snapid);
10502 for (; p != infomap.end() && num < max_entries; ++p) {
10503 dout(10) << p->first << " -> " << *p->second << dendl;
10504
10505 // actual
10506 string snap_name;
10507 if (p->second->ino == diri->ino())
10508 snap_name = p->second->name;
10509 else
10510 snap_name = p->second->get_long_name();
10511
10512 unsigned start_len = dnbl.length();
10513 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10514 break;
10515
10516 encode(snap_name, dnbl);
10517 //infinite lease
10518 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10519 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10520 dout(20) << "encode_infinite_lease" << dendl;
10521
10522 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10523 if (r < 0) {
10524 bufferlist keep;
10525 keep.substr_of(dnbl, 0, start_len);
10526 dnbl.swap(keep);
10527 break;
10528 }
10529 ++num;
10530 }
10531
10532 encode(num, dirbl);
10533 __u16 flags = 0;
10534 if (p == infomap.end()) {
10535 flags = CEPH_READDIR_FRAG_END;
10536 if (last_snapid == 0)
10537 flags |= CEPH_READDIR_FRAG_COMPLETE;
10538 }
10539 encode(flags, dirbl);
10540 dirbl.claim_append(dnbl);
10541
10542 mdr->reply_extra_bl = dirbl;
10543 mdr->tracei = diri;
10544 respond_to_request(mdr, 0);
10545 }
10546
10547
10548 // MKSNAP
10549
10550 struct C_MDS_mksnap_finish : public ServerLogContext {
10551 CInode *diri;
10552 SnapInfo info;
10553 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10554 ServerLogContext(s, r), diri(di), info(i) {}
10555 void finish(int r) override {
10556 server->_mksnap_finish(mdr, diri, info);
10557 }
10558 };
10559
10560 /* This function takes responsibility for the passed mdr*/
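// mksnap is a two-phase operation against the snap table: the first pass
// reserves a snapid/stid via snapclient->prepare_create() and retries the
// request once that completes; the second pass journals an EUpdate that bumps
// rstat.rsnaps and records the new SnapInfo in the projected snaprealm, and
// _mksnap_finish commits the table transaction and notifies other MDS ranks
// and clients.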
10561 void Server::handle_client_mksnap(MDRequestRef& mdr)
10562 {
10563 const cref_t<MClientRequest> &req = mdr->client_request;
10564 // make sure we have as new a map as the client
10565 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10566 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10567 return;
10568 }
10569 if (!mds->mdsmap->allows_snaps()) {
10570 // snapshot creation is disabled until the fs's 'allow_new_snaps' flag is set
10571 dout(5) << "new snapshots are disabled for this fs" << dendl;
10572 respond_to_request(mdr, -CEPHFS_EPERM);
10573 return;
10574 }
10575
10576 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10577 if (!diri)
10578 return;
10579
10580 // dir only
10581 if (!diri->is_dir()) {
10582 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10583 return;
10584 }
10585 if (diri->is_system() && !diri->is_root()) {
10586 // no snaps in system dirs (root is ok)
10587 dout(5) << "is an internal system dir" << dendl;
10588 respond_to_request(mdr, -CEPHFS_EPERM);
10589 return;
10590 }
10591
10592 std::string_view snapname = req->get_filepath().last_dentry();
10593
10594 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10595 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10596 respond_to_request(mdr, -CEPHFS_EPERM);
10597 return;
10598 }
10599
10600 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10601
10602 // lock snap
10603 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10604 MutationImpl::LockOpVec lov;
10605 lov.add_xlock(&diri->snaplock);
10606 if (!mds->locker->acquire_locks(mdr, lov))
10607 return;
10608
10609 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10610 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10611 return;
10612 }
10613 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10614 }
10615
10616 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10617 return;
10618
10619 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10620 (subvol_ino && subvol_ino != diri->ino())) {
10621 dout(5) << "is a descendent of a subvolume dir" << dendl;
10622 respond_to_request(mdr, -CEPHFS_EPERM);
10623 return;
10624 }
10625
10626 // check if we can create any more snapshots
10627 // we don't allow any more if we are already at or beyond the limit
10628 if (diri->snaprealm &&
10629 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
10630 respond_to_request(mdr, -CEPHFS_EMLINK);
10631 return;
10632 }
10633
10634 // make sure name is unique
10635 if (diri->snaprealm &&
10636 diri->snaprealm->exists(snapname)) {
10637 respond_to_request(mdr, -CEPHFS_EEXIST);
10638 return;
10639 }
10640 if (snapname.length() == 0 ||
10641 snapname[0] == '_') {
10642 respond_to_request(mdr, -CEPHFS_EINVAL);
10643 return;
10644 }
10645
10646 // allocate a snapid
10647 if (!mdr->more()->stid) {
10648 // prepare an stid
10649 mds->snapclient->prepare_create(diri->ino(), snapname,
10650 mdr->get_mds_stamp(),
10651 &mdr->more()->stid, &mdr->more()->snapidbl,
10652 new C_MDS_RetryRequest(mdcache, mdr));
10653 return;
10654 }
10655
10656 version_t stid = mdr->more()->stid;
10657 snapid_t snapid;
10658 auto p = mdr->more()->snapidbl.cbegin();
10659 decode(snapid, p);
10660 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10661
10662 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10663
10664 SnapPayload payload;
10665 if (req->get_data().length()) {
10666 try {
10667 auto iter = req->get_data().cbegin();
10668 decode(payload, iter);
10669 } catch (const ceph::buffer::error &e) {
10670 // backward compat -- client sends xattr bufferlist. however,
10671 // that is not used anywhere -- so (log and) ignore.
10672 dout(20) << ": no metadata in payload (old client?)" << dendl;
10673 }
10674 }
10675
10676 // journal
10677 SnapInfo info;
10678 info.ino = diri->ino();
10679 info.snapid = snapid;
10680 info.name = snapname;
10681 info.stamp = mdr->get_op_stamp();
10682 info.metadata = payload.metadata;
10683
10684 auto pi = diri->project_inode(mdr, false, true);
10685 pi.inode->ctime = info.stamp;
10686 if (info.stamp > pi.inode->rstat.rctime)
10687 pi.inode->rstat.rctime = info.stamp;
10688 pi.inode->rstat.rsnaps++;
10689 pi.inode->version = diri->pre_dirty();
10690
10691 // project the snaprealm
10692 auto &newsnap = *pi.snapnode;
10693 newsnap.created = snapid;
10694 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10695 if (!em.second)
10696 em.first->second = info;
10697 newsnap.seq = snapid;
10698 newsnap.last_created = snapid;
10699
10700 // journal the inode changes
10701 mdr->ls = mdlog->get_current_segment();
10702 EUpdate *le = new EUpdate(mdlog, "mksnap");
10703 mdlog->start_entry(le);
10704
10705 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10706 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10707 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10708 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10709
10710 // journal the snaprealm changes
10711 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10712 mdr, __func__);
10713 mdlog->flush();
10714 }
10715
10716 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10717 {
10718 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10719
10720 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
10721
10722 mdr->apply();
10723
10724 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10725
10726 // create snap
10727 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10728
10729 // notify other mds
10730 mdcache->send_snap_update(diri, mdr->more()->stid, op);
10731
10732 mdcache->do_realm_invalidate_and_update_notify(diri, op);
10733
10734 // yay
10735 mdr->in[0] = diri;
10736 mdr->snapid = info.snapid;
10737 mdr->tracei = diri;
10738 respond_to_request(mdr, 0);
10739 }
10740
10741
10742 // RMSNAP
10743
10744 struct C_MDS_rmsnap_finish : public ServerLogContext {
10745 CInode *diri;
10746 snapid_t snapid;
10747 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10748 ServerLogContext(s, r), diri(di), snapid(sn) {}
10749 void finish(int r) override {
10750 server->_rmsnap_finish(mdr, diri, snapid);
10751 }
10752 };
10753
10754 /* This function takes responsibility for the passed mdr*/
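// rmsnap mirrors mksnap: prepare_destroy() against the snap table first, then
// journal an EUpdate that drops the SnapInfo from the projected snaprealm and
// decrements rstat.rsnaps; _rmsnap_finish commits the table transaction,
// notifies other MDS ranks/clients and purges stale snap data.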
10755 void Server::handle_client_rmsnap(MDRequestRef& mdr)
10756 {
10757 const cref_t<MClientRequest> &req = mdr->client_request;
10758
10759 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10760 if (!diri)
10761 return;
10762
10763 if (!diri->is_dir()) {
10764 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10765 return;
10766 }
10767
10768 std::string_view snapname = req->get_filepath().last_dentry();
10769
10770 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10771 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10772 respond_to_request(mdr, -CEPHFS_EPERM);
10773 return;
10774 }
10775
10776 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
10777
10778 // does snap exist?
10779 if (snapname.length() == 0 || snapname[0] == '_') {
10780 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
10781 return;
10782 }
10783 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
10784 respond_to_request(mdr, -CEPHFS_ENOENT);
10785 return;
10786 }
10787 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
10788 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
10789
10790 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10791 MutationImpl::LockOpVec lov;
10792 lov.add_xlock(&diri->snaplock);
10793 if (!mds->locker->acquire_locks(mdr, lov))
10794 return;
10795 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10796 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10797 return;
10798 }
10799 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10800 }
10801
10802 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10803 return;
10804
10805 // prepare
10806 if (!mdr->more()->stid) {
10807 mds->snapclient->prepare_destroy(diri->ino(), snapid,
10808 &mdr->more()->stid, &mdr->more()->snapidbl,
10809 new C_MDS_RetryRequest(mdcache, mdr));
10810 return;
10811 }
10812 version_t stid = mdr->more()->stid;
10813 auto p = mdr->more()->snapidbl.cbegin();
10814 snapid_t seq;
10815 decode(seq, p);
10816 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
10817
10818 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10819
10820 // journal
10821 auto pi = diri->project_inode(mdr, false, true);
10822 pi.inode->version = diri->pre_dirty();
10823 pi.inode->ctime = mdr->get_op_stamp();
10824 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10825 pi.inode->rstat.rctime = mdr->get_op_stamp();
10826 pi.inode->rstat.rsnaps--;
10827
10828 mdr->ls = mdlog->get_current_segment();
10829 EUpdate *le = new EUpdate(mdlog, "rmsnap");
10830 mdlog->start_entry(le);
10831
10832 // project the snaprealm
10833 auto &newnode = *pi.snapnode;
10834 newnode.snaps.erase(snapid);
10835 newnode.seq = seq;
10836 newnode.last_destroyed = seq;
10837
10838 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10839 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10840 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10841 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10842
10843 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
10844 mdr, __func__);
10845 mdlog->flush();
10846 }
10847
10848 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10849 {
10850 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
10851 snapid_t stid = mdr->more()->stid;
10852 auto p = mdr->more()->snapidbl.cbegin();
10853 snapid_t seq;
10854 decode(seq, p);
10855
10856 mdr->apply();
10857
10858 mds->snapclient->commit(stid, mdr->ls);
10859
10860 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10861
10862 // notify other mds
10863 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
10864
10865 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
10866
10867 // yay
10868 mdr->in[0] = diri;
10869 respond_to_request(mdr, 0);
10870
10871 // purge snapshot data
10872 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
10873 }
10874
10875 struct C_MDS_renamesnap_finish : public ServerLogContext {
10876 CInode *diri;
10877 snapid_t snapid;
10878 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10879 ServerLogContext(s, r), diri(di), snapid(sn) {}
10880 void finish(int r) override {
10881 server->_renamesnap_finish(mdr, diri, snapid);
10882 }
10883 };
10884
10885 /* This function takes responsibility for the passed mdr*/
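// renamesnap: path holds the new snap name, path2 the old one.  After the
// usual name/permission checks the snap table is updated via prepare_update(),
// then an EUpdate renames the SnapInfo in the projected snaprealm;
// _renamesnap_finish commits and broadcasts CEPH_SNAP_OP_UPDATE.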
10886 void Server::handle_client_renamesnap(MDRequestRef& mdr)
10887 {
10888 const cref_t<MClientRequest> &req = mdr->client_request;
10889 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
10890 respond_to_request(mdr, -CEPHFS_EINVAL);
10891 return;
10892 }
10893
10894 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10895 if (!diri)
10896 return;
10897
10898 if (!diri->is_dir()) { // dir only
10899 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10900 return;
10901 }
10902
10903 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
10904 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10905 respond_to_request(mdr, -CEPHFS_EPERM);
10906 return;
10907 }
10908
10909 std::string_view dstname = req->get_filepath().last_dentry();
10910 std::string_view srcname = req->get_filepath2().last_dentry();
10911 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
10912
10913 if (srcname.length() == 0 || srcname[0] == '_') {
10914 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
10915 return;
10916 }
10917 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
10918 respond_to_request(mdr, -CEPHFS_ENOENT);
10919 return;
10920 }
10921 if (dstname.length() == 0 || dstname[0] == '_') {
10922 respond_to_request(mdr, -CEPHFS_EINVAL);
10923 return;
10924 }
10925 if (diri->snaprealm->exists(dstname)) {
10926 respond_to_request(mdr, -CEPHFS_EEXIST);
10927 return;
10928 }
10929
10930 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
10931 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
10932
10933 // lock snap
10934 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10935 MutationImpl::LockOpVec lov;
10936 lov.add_xlock(&diri->snaplock);
10937 if (!mds->locker->acquire_locks(mdr, lov))
10938 return;
10939 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10940 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10941 return;
10942 }
10943 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10944 }
10945
10946 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10947 return;
10948
10949 // prepare
10950 if (!mdr->more()->stid) {
10951 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
10952 &mdr->more()->stid,
10953 new C_MDS_RetryRequest(mdcache, mdr));
10954 return;
10955 }
10956
10957 version_t stid = mdr->more()->stid;
10958 dout(10) << " stid is " << stid << dendl;
10959
10960 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10961
10962 // journal
10963 auto pi = diri->project_inode(mdr, false, true);
10964 pi.inode->ctime = mdr->get_op_stamp();
10965 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10966 pi.inode->rstat.rctime = mdr->get_op_stamp();
10967 pi.inode->version = diri->pre_dirty();
10968
10969 // project the snaprealm
10970 auto &newsnap = *pi.snapnode;
10971 auto it = newsnap.snaps.find(snapid);
10972 ceph_assert(it != newsnap.snaps.end());
10973 it->second.name = dstname;
10974
10975 // journal the inode changes
10976 mdr->ls = mdlog->get_current_segment();
10977 EUpdate *le = new EUpdate(mdlog, "renamesnap");
10978 mdlog->start_entry(le);
10979
10980 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10981 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10982 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10983 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10984
10985 // journal the snaprealm changes
10986 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
10987 mdr, __func__);
10988 mdlog->flush();
10989 }
10990
10991 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10992 {
10993 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
10994
10995 mdr->apply();
10996
10997 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10998
10999 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11000
11001 // notify other mds
11002 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
11003
11004 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
11005
11006 // yay
11007 mdr->in[0] = diri;
11008 mdr->tracei = diri;
11009 mdr->snapid = snapid;
11010 respond_to_request(mdr, 0);
11011 }
11012
11013 /**
11014 * Return true if server is in state RECONNECT and this
11015 * client has not yet reconnected.
11016 */
11017 bool Server::waiting_for_reconnect(client_t c) const
11018 {
11019 return client_reconnect_gather.count(c) > 0;
11020 }
11021
11022 void Server::dump_reconnect_status(Formatter *f) const
11023 {
11024 f->open_object_section("reconnect_status");
11025 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
11026 f->close_section();
11027 }