[ceph.git] / ceph / src / mds / Server.cc (update ceph source to reef 18.1.2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
34
35 #include "msg/Messenger.h"
36
37 #include "osdc/Objecter.h"
38
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
45
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
53 #include "fscrypt.h"
54
55 #include <errno.h>
56
57 #include <list>
58 #include <regex>
59 #include <string_view>
60 #include <functional>
61
62 #include "common/config.h"
63
64 #include "msg/Message.h"
65
66 #define dout_context g_ceph_context
67 #define dout_subsys ceph_subsys_mds
68 #undef dout_prefix
69 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
70
71 using namespace std;
72
73 class ServerContext : public MDSContext {
74 protected:
75 Server *server;
76 MDSRank *get_mds() override
77 {
78 return server->mds;
79 }
80
81 public:
82 explicit ServerContext(Server *s) : server(s) {
83 ceph_assert(server != NULL);
84 }
85 };
86
87 class Batch_Getattr_Lookup : public BatchOp {
88 protected:
89 Server* server;
90 ceph::ref_t<MDRequestImpl> mdr;
91 std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
92 int res = 0;
93 public:
94 Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
95 : server(s), mdr(r) {
96 if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
97 mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
98 else
99 mdr->batch_op_map = &mdr->in[0]->batch_ops;
100 }
101 void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
102 batch_reqs.push_back(r);
103 }
104 ceph::ref_t<MDRequestImpl> find_new_head() override {
105 while (!batch_reqs.empty()) {
106 auto r = std::move(batch_reqs.back());
107 batch_reqs.pop_back();
108 if (r->killed)
109 continue;
110
111 r->batch_op_map = mdr->batch_op_map;
112 mdr->batch_op_map = nullptr;
113 mdr = r;
114 return mdr;
115 }
116 return nullptr;
117 }
118 void _forward(mds_rank_t t) override {
119 MDCache* mdcache = server->mdcache;
120 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
121 mdr->set_mds_stamp(ceph_clock_now());
122 for (auto& m : batch_reqs) {
123 if (!m->killed)
124 mdcache->request_forward(m, t);
125 }
126 batch_reqs.clear();
127 }
128 void _respond(int r) override {
129 mdr->set_mds_stamp(ceph_clock_now());
130 for (auto& m : batch_reqs) {
131 if (!m->killed) {
132 m->tracei = mdr->tracei;
133 m->tracedn = mdr->tracedn;
134 server->respond_to_request(m, r);
135 }
136 }
137 batch_reqs.clear();
138 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
139 }
140 void print(std::ostream& o) {
141 o << "[batch front=" << *mdr << "]";
142 }
143 };
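// Illustrative usage sketch (not part of the original source; the surrounding
// variable names are hypothetical): while a getattr/lookup for the same
// inode/dentry is already in flight, later duplicates join the existing batch
// instead of re-driving the path traversal, and one _respond() from the head
// request answers all of them, e.g. roughly:
//
//   auto* batch_ops = &in->batch_ops;                 // or dn->batch_ops for lookup
//   if (auto it = batch_ops->find(mask); it != batch_ops->end()) {
//     it->second->add_request(mdr);                   // piggy-back on the in-flight op
//   } else {
//     (*batch_ops)[mask] = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
//   }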
144
145 class ServerLogContext : public MDSLogContextBase {
146 protected:
147 Server *server;
148 MDSRank *get_mds() override
149 {
150 return server->mds;
151 }
152
153 MDRequestRef mdr;
154 void pre_finish(int r) override {
155 if (mdr)
156 mdr->mark_event("journal_committed: ");
157 }
158 public:
159 explicit ServerLogContext(Server *s) : server(s) {
160 ceph_assert(server != NULL);
161 }
162 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
163 ceph_assert(server != NULL);
164 }
165 };
166
167 void Server::create_logger()
168 {
169 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
170
171 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
172 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
173 plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
174 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
175 plb.add_u64_counter(l_mdss_handle_client_session,
176 "handle_client_session", "Client session messages", "hcs",
177 PerfCountersBuilder::PRIO_INTERESTING);
178 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
179 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
180 plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
181 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
182 PerfCountersBuilder::PRIO_INTERESTING);
183
184 // fop latencies are useful
185 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
186 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
187 "Request type lookup hash of inode latency");
188 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
189 "Request type lookup inode latency");
190 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
191 "Request type lookup parent latency");
192 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
193 "Request type lookup name latency");
194 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
195 "Request type lookup latency");
196 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
197 "Request type lookup snapshot latency");
198 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
199 "Request type get attribute latency");
200 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
201 "Request type set attribute latency");
202 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
203 "Request type set file layout latency");
204 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
205 "Request type set directory layout latency");
206 plb.add_time_avg(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
207 "Request type get virtual extended attribute latency");
208 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
209 "Request type set extended attribute latency");
210 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
211 "Request type remove extended attribute latency");
212 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
213 "Request type read directory latency");
214 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
215 "Request type set file lock latency");
216 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
217 "Request type get file lock latency");
218 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
219 "Request type create latency");
220 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
221 "Request type open latency");
222 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
223 "Request type make node latency");
224 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
225 "Request type link latency");
226 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
227 "Request type unlink latency");
228 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
229 "Request type remove directory latency");
230 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
231 "Request type rename latency");
232 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
233 "Request type make directory latency");
234 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
235 "Request type symbolic link latency");
236 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
237 "Request type list snapshot latency");
238 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
239 "Request type make snapshot latency");
240 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
241 "Request type remove snapshot latency");
242 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
243 "Request type rename snapshot latency");
244
245 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
246 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
247 "Client requests dispatched");
248 plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
249 "Server requests dispatched");
250
251 logger = plb.create_perf_counters();
252 g_ceph_context->get_perfcounters_collection()->add(logger);
253 }
254
255 Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
256 mds(m),
257 mdcache(mds->mdcache), mdlog(mds->mdlog),
258 inject_rename_corrupt_dentry_first(g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first")),
259 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
260 metrics_handler(metrics_handler)
261 {
262 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
263 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
264 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
265 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
266 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
267 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
268 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
269 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
270 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
271 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
272 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
273 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
274 supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
275 }
276
277 void Server::dispatch(const cref_t<Message> &m)
278 {
279 switch (m->get_type()) {
280 case CEPH_MSG_CLIENT_RECONNECT:
281 handle_client_reconnect(ref_cast<MClientReconnect>(m));
282 return;
283 }
284
285 /*
286  *In the reconnect phase, clients may have sent unsafe requests to the MDS before their reconnect message. Setting sessionclosed_isok handles scenarios like this:
287
288 1. In the reconnect phase, a client sent unsafe requests to the MDS.
289 2. The reconnect timeout was reached. All sessions that did not send a reconnect message in time, some of which may have sent unsafe requests, were marked as closed.
290 (Another situation is #31668, which denies all client reconnect messages to speed up reboot.)
291 3. So these unsafe requests, from sessions that did not reconnect in time or were denied, can still be handled in the clientreplay phase.
292
293 */
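  // (Illustrative note, not in the original source: the sessionclosed_isok
  //  behaviour described above is gated by the MDS config read in the Server
  //  constructor, which an operator could toggle at runtime with something like
  //    ceph config set mds mds_replay_unsafe_with_closed_session true
  //  so that such replayed unsafe requests are still queued for clientreplay.)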
294 bool sessionclosed_isok = replay_unsafe_with_closed_session;
295 // active?
296 // handle_peer_request()/handle_client_session() will wait if necessary
297 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
298 const auto &req = ref_cast<MClientRequest>(m);
299 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
300 Session *session = mds->get_session(req);
301 if (!session || (!session->is_open() && !sessionclosed_isok)) {
302 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
303 return;
304 }
305 bool queue_replay = false;
306 if (req->is_replay() || req->is_async()) {
307 dout(3) << "queuing replayed op" << dendl;
308 queue_replay = true;
309 if (req->head.ino &&
310 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
311 inodeno_t ino(req->head.ino);
312 mdcache->add_replay_ino_alloc(ino);
313 if (replay_unsafe_with_closed_session &&
314 session->free_prealloc_inos.contains(ino)) {
315 // don't purge inodes that will be created by later replay
316 session->free_prealloc_inos.erase(ino);
317 session->delegated_inos.insert(ino);
318 }
319 }
320 } else if (req->get_retry_attempt()) {
321       // process completed requests in the clientreplay stage. The completed request
322       // might have created a new file/directory. This guarantees the MDS sends a reply
323       // to the client before another request modifies the new file/directory.
324 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
325 dout(3) << "queuing completed op" << dendl;
326 queue_replay = true;
327 }
328 // this request was created before the cap reconnect message, drop any embedded
329 // cap releases.
330 req->releases.clear();
331 }
332 if (queue_replay) {
333 req->mark_queued_for_replay();
334 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
335 return;
336 }
337 }
338
339 bool wait_for_active = true;
340 if (mds->is_stopping()) {
341 wait_for_active = false;
342 } else if (mds->is_clientreplay()) {
343 if (req->is_queued_for_replay()) {
344 wait_for_active = false;
345 }
346 }
347 if (wait_for_active) {
348 dout(3) << "not active yet, waiting" << dendl;
349 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
350 return;
351 }
352 }
353
354 switch (m->get_type()) {
355 case CEPH_MSG_CLIENT_SESSION:
356 handle_client_session(ref_cast<MClientSession>(m));
357 return;
358 case CEPH_MSG_CLIENT_REQUEST:
359 handle_client_request(ref_cast<MClientRequest>(m));
360 return;
361 case CEPH_MSG_CLIENT_RECLAIM:
362 handle_client_reclaim(ref_cast<MClientReclaim>(m));
363 return;
364 case MSG_MDS_PEER_REQUEST:
365 handle_peer_request(ref_cast<MMDSPeerRequest>(m));
366 return;
367 default:
368 derr << "Server unknown message " << m->get_type() << " from peer type " << m->get_connection()->get_peer_type() << dendl;
369 ceph_abort_msg("server unknown message " + to_string(m->get_type()) + " from peer type " + to_string(m->get_connection()->get_peer_type()));
370 }
371 }
372
373
374
375 // ----------------------------------------------------------
376 // SESSION management
377
378 class C_MDS_session_finish : public ServerLogContext {
379 Session *session;
380 uint64_t state_seq;
381 bool open;
382 version_t cmapv;
383 interval_set<inodeno_t> inos_to_free;
384 version_t inotablev;
385 interval_set<inodeno_t> inos_to_purge;
386 LogSegment *ls = nullptr;
387 Context *fin;
388 public:
389 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
390 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
391 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
392 const interval_set<inodeno_t>& to_free, version_t iv,
393 const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
394 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
395 inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
396 void finish(int r) override {
397 ceph_assert(r == 0);
398 server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
399 if (fin) {
400 fin->complete(r);
401 }
402 }
403 };
404
405 Session* Server::find_session_by_uuid(std::string_view uuid)
406 {
407 Session* session = nullptr;
408 for (auto& it : mds->sessionmap.get_sessions()) {
409 auto& metadata = it.second->info.client_metadata;
410
411 auto p = metadata.find("uuid");
412 if (p == metadata.end() || p->second != uuid)
413 continue;
414
415 if (!session) {
416 session = it.second;
417 } else if (!session->reclaiming_from) {
418 ceph_assert(it.second->reclaiming_from == session);
419 session = it.second;
420 } else {
421 ceph_assert(session->reclaiming_from == it.second);
422 }
423 }
424 return session;
425 }
426
427 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
428 {
429 if (!session->is_open() && !session->is_stale()) {
430 dout(10) << "session not open, dropping this req" << dendl;
431 return;
432 }
433
434 auto reply = make_message<MClientReclaimReply>(0);
435 if (m->get_uuid().empty()) {
436 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
437 reply->set_result(-CEPHFS_EINVAL);
438 mds->send_message_client(reply, session);
439 return;
440 }
441
442 unsigned flags = m->get_flags();
443 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
444 dout(10) << __func__ << " unsupported flags" << dendl;
445 reply->set_result(-CEPHFS_EINVAL);
446 mds->send_message_client(reply, session);
447 return;
448 }
449
450 Session* target = find_session_by_uuid(m->get_uuid());
451 if (target) {
452 if (session->info.auth_name != target->info.auth_name) {
453 dout(10) << __func__ << " session auth_name " << session->info.auth_name
454 << " != target auth_name " << target->info.auth_name << dendl;
455 reply->set_result(-CEPHFS_EPERM);
456 mds->send_message_client(reply, session);
457 }
458
459 ceph_assert(!target->reclaiming_from);
460 ceph_assert(!session->reclaiming_from);
461 session->reclaiming_from = target;
462 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
463 }
464
465 if (flags & CEPH_RECLAIM_RESET) {
466 finish_reclaim_session(session, reply);
467 } else ceph_assert(0); /* no other flags are handled at this time */
468 }
469
470 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
471 {
472 Session *target = session->reclaiming_from;
473 if (target) {
474 session->reclaiming_from = nullptr;
475
476 Context *send_reply;
477 if (reply) {
478 int64_t session_id = session->get_client().v;
479 send_reply = new LambdaContext([this, session_id, reply](int r) {
480 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
481 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
482 if (!session) {
483 return;
484 }
485 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
486 reply->set_epoch(epoch);
487 mds->send_message_client(reply, session);
488 });
489 } else {
490 send_reply = nullptr;
491 }
492
493 bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
494 return map.is_blocklisted(target->info.inst.addr);
495 });
496
497 if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
498 kill_session(target, send_reply);
499 } else {
500 CachedStackStringStream css;
501 mds->evict_client(target->get_client().v, false, true, *css, send_reply);
502 }
503 } else if (reply) {
504 mds->send_message_client(reply, session);
505 }
506 }
507
508 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
509 {
510 Session *session = mds->get_session(m);
511 uint32_t flags = m->get_flags();
512 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
513 ceph_assert(m->is_a_client()); // should _not_ come from an mds!
514
515 if (!session) {
516 dout(0) << " ignoring sessionless msg " << *m << dendl;
517 return;
518 }
519
520 std::string_view fs_name = mds->mdsmap->get_fs_name();
521 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
522 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
523 return;
524 }
525
526 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
527 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
528 return;
529 }
530
531 if (flags & MClientReclaim::FLAG_FINISH) {
532 if (flags ^ MClientReclaim::FLAG_FINISH) {
533 dout(0) << __func__ << " client specified FLAG_FINISH with other flags."
534 " Other flags:" << flags << dendl;
535 auto reply = make_message<MClientReclaimReply>(0);
536 reply->set_result(-CEPHFS_EINVAL);
537 mds->send_message_client(reply, session);
538 return;
539 }
540 finish_reclaim_session(session);
541 } else {
542 reclaim_session(session, m);
543 }
544 }
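// Illustrative client-side view (not part of the original source): a kernel
// client mounted with the recover_session=clean option, e.g. hypothetically
//   mount -t ceph mon1:6789:/ /mnt -o name=foo,recover_session=clean
// will, after noticing it was blocklisted, open a fresh session and send an
// MClientReclaim carrying its previous session uuid; the code above then looks
// up that uuid, evicts or kills the stale instance, and replies with an
// MClientReclaimReply so the new session can take over.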
545
546 void Server::handle_client_session(const cref_t<MClientSession> &m)
547 {
548 version_t pv;
549 Session *session = mds->get_session(m);
550
551 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
552 ceph_assert(m->is_a_client()); // should _not_ come from an mds!
553
554 if (!session) {
555 dout(0) << " ignoring sessionless msg " << *m << dendl;
556 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
557 reply->metadata["error_string"] = "sessionless";
558 mds->send_message(reply, m->get_connection());
559 return;
560 }
561
562 std::string_view fs_name = mds->mdsmap->get_fs_name();
563 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
564 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
565 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
566 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
567 std::string(fs_name) + "\"";
568 mds->send_message(std::move(reply), m->get_connection());
569 return;
570 }
571
572 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
573 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
574 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
575 // close requests need to be handled when mds is active
576 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
577 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
578 return;
579 }
580 } else {
581 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
582 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
583 return;
584 }
585 }
586
587 if (logger)
588 logger->inc(l_mdss_handle_client_session);
589
590 uint64_t sseq = 0;
591 switch (m->get_op()) {
592 case CEPH_SESSION_REQUEST_OPEN:
593 if(mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
594 dout(0) << "new sessions are not permitted, enable again via"
595 "`ceph fs set <fs_name> refuse_client_session false`" << dendl;
596 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
597 reply->metadata["error_string"] = "new sessions are not permitted,"
598 " enable again via `ceph fs set"
599 " <fs_name> refuse_client_session false`";
600 mds->send_message(reply, m->get_connection());
601 return;
602 }
603 if (session->is_opening() ||
604 session->is_open() ||
605 session->is_stale() ||
606 session->is_killing() ||
607 terminating_sessions) {
608 if (m->supported_features.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE)) {
609 if (session->is_open() && !mds->is_stopping()) {
610 dout(10) << "currently already opened" << dendl;
611
612 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN,
613 session->get_push_seq());
614 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
615 reply->supported_features = supported_features;
616 mds->send_message_client(reply, session);
617 if (mdcache->is_readonly()) {
618 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
619 mds->send_message_client(m, session);
620 }
621 }
622 }
623 dout(10) << "currently " << session->get_state_name()
624 << ", dropping this req" << dendl;
625 return;
626 }
627 ceph_assert(session->is_closed() || session->is_closing());
628
629 if (mds->is_stopping()) {
630 dout(10) << "mds is stopping, dropping open req" << dendl;
631 return;
632 }
633
634 {
635 auto& addr = session->info.inst.addr;
636 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
637 auto& client_metadata = session->info.client_metadata;
638
639 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
640 auto now = ceph_clock_now();
641 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
642 auto elapsed = now - m->get_recv_stamp();
643 CachedStackStringStream css;
644 *css << "New client session:"
645 << " addr=\"" << session->info.inst.addr << "\""
646 << ",elapsed=" << elapsed
647 << ",throttled=" << throttle_elapsed
648 << ",status=\"" << status << "\"";
649 if (!err.empty()) {
650 *css << ",error=\"" << err << "\"";
651 }
652 const auto& metadata = session->info.client_metadata;
653 if (auto it = metadata.find("root"); it != metadata.end()) {
654 *css << ",root=\"" << it->second << "\"";
655 }
656 dout(2) << css->strv() << dendl;
657 };
658
659 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
660 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
661 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
662 m->metadata["error_string"] = err_str;
663 mds->send_message_client(m, session);
664 log_session_status("REJECTED", err_str);
665 };
666
667 bool blocklisted = mds->objecter->with_osdmap(
668 [&addr](const OSDMap &osd_map) -> bool {
669 return osd_map.is_blocklisted(addr);
670 });
671
672 if (blocklisted) {
673 dout(10) << "rejecting blocklisted client " << addr << dendl;
674 // This goes on the wire and the "blacklisted" substring is
675 // depended upon by the kernel client for detecting whether it
676 // has been blocklisted. If mounted with recover_session=clean
677 // (since 5.4), it tries to automatically recover itself from
678 // blocklisting.
679 unsigned flags = 0;
680 flags |= MClientSession::SESSION_BLOCKLISTED;
681 send_reject_message("blocklisted (blacklisted)", flags);
682 session->clear();
683 break;
684 }
685
686 if (client_metadata.features.empty())
687 infer_supported_features(session, client_metadata);
688
689 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
690 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
691 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
692 for (const auto& p : client_metadata) {
693 dout(20) << " " << p.first << ": " << p.second << dendl;
694 }
695
696 feature_bitset_t missing_features = required_client_features;
697 missing_features -= client_metadata.features;
698 if (!missing_features.empty()) {
699 CachedStackStringStream css;
700 *css << "missing required features '" << missing_features << "'";
701 send_reject_message(css->strv());
702 mds->clog->warn() << "client session (" << session->info.inst
703 << ") lacks required features " << missing_features
704 << "; client supports " << client_metadata.features;
705 session->clear();
706 break;
707 }
708
709 // Special case for the 'root' metadata path; validate that the claimed
710 // root is actually within the caps of the session
711 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
712 auto claimed_root = it->second;
713 CachedStackStringStream css;
714 bool denied = false;
715 // claimed_root has a leading "/" which we strip before passing
716 // into caps check
717 if (claimed_root.empty() || claimed_root[0] != '/') {
718 denied = true;
719 *css << "invalue root '" << claimed_root << "'";
720 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
721 denied = true;
722 *css << "non-allowable root '" << claimed_root << "'";
723 }
724
725 if (denied) {
726 // Tell the client we're rejecting their open
727 send_reject_message(css->strv());
728 mds->clog->warn() << "client session with " << css->strv()
729 << " denied (" << session->info.inst << ")";
730 session->clear();
731 break;
732 }
733 }
734
735 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
736 if (find_session_by_uuid(it->second)) {
737 send_reject_message("duplicated session uuid");
738 mds->clog->warn() << "client session with duplicated session uuid '"
739 << it->second << "' denied (" << session->info.inst << ")";
740 session->clear();
741 break;
742 }
743 }
744
745 if (session->is_closed()) {
746 mds->sessionmap.add_session(session);
747 }
748
749 pv = mds->sessionmap.mark_projected(session);
750 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
751 mds->sessionmap.touch_session(session);
752 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
753 ceph_assert(r == 0);
754 log_session_status("ACCEPTED", "");
755 });
756 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
757 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
758 mdlog->flush();
759 }
760 break;
761
762 case CEPH_SESSION_REQUEST_RENEWCAPS:
763 if (session->is_open() || session->is_stale()) {
764 mds->sessionmap.touch_session(session);
765 if (session->is_stale()) {
766 mds->sessionmap.set_state(session, Session::STATE_OPEN);
767 mds->locker->resume_stale_caps(session);
768 mds->sessionmap.touch_session(session);
769 }
770 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
771 mds->send_message_client(reply, session);
772 } else {
773 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
774 }
775 break;
776
777 case CEPH_SESSION_REQUEST_CLOSE:
778 {
779 if (session->is_closed() ||
780 session->is_closing() ||
781 session->is_killing()) {
782 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
783 return;
784 }
785 if (session->is_importing()) {
786 dout(10) << "ignoring close req on importing session" << dendl;
787 return;
788 }
789 ceph_assert(session->is_open() ||
790 session->is_stale() ||
791 session->is_opening());
792 if (m->get_seq() < session->get_push_seq()) {
793 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
794 << ", dropping" << dendl;
795 return;
796 }
797 // We are getting a seq that is higher than expected.
798 // Handle the same as any other seqn error.
799 //
800 if (m->get_seq() != session->get_push_seq()) {
801 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
802 << ", BUGGY!" << dendl;
803 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
804 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
805 return;
806 }
807 journal_close_session(session, Session::STATE_CLOSING, NULL);
808 }
809 break;
810
811 case CEPH_SESSION_FLUSHMSG_ACK:
812 finish_flush_session(session, m->get_seq());
813 break;
814
815 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
816 if (mds->is_active())
817 mdlog->flush();
818 break;
819
820 default:
821 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
822 mds->send_message_client(m, session);
823 derr << "Server received unknown message " << m->get_type() << ", closing session and blocklisting the client " << session->get_client() << dendl;
824 CachedStackStringStream css;
825 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
826 }
827 }
828
829 void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
830 if (!session->is_open() ||
831 !session->get_connection() ||
832 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
833 return;
834 }
835
836 version_t seq = session->wait_for_flush(gather.new_sub());
837 mds->send_message_client(
838 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
839 }
840
841 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
842 {
843 for (const auto& client : client_set) {
844 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
845 ceph_assert(session);
846 flush_session(session, gather);
847 }
848 }
849
850 void Server::finish_flush_session(Session *session, version_t seq)
851 {
852 MDSContext::vec finished;
853 session->finish_flush(seq, finished);
854 mds->queue_waiters(finished);
855 }
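// Illustrative flow for the flush handshake above (comment sketch only; the
// gather wiring is simplified and the finisher context is hypothetical):
//
//   MDSGatherBuilder gather(g_ceph_context, new MDSInternalContextWrapper(mds, fin));
//   flush_client_sessions(clients, gather);  // sends CEPH_SESSION_FLUSHMSG(seq) per session
//   if (gather.has_subs())
//     gather.activate();
//   // each client answers with CEPH_SESSION_FLUSHMSG_ACK(seq), which arrives in
//   // handle_client_session() and lands in finish_flush_session(), completing
//   // the per-session sub-context registered by Session::wait_for_flush().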
856
857 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
858 const interval_set<inodeno_t>& inos_to_free, version_t piv,
859 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
860 {
861 dout(10) << "_session_logged " << session->info.inst
862 << " state_seq " << state_seq
863 << " " << (open ? "open":"close") << " " << pv
864 << " inos_to_free " << inos_to_free << " inotablev " << piv
865 << " inos_to_purge " << inos_to_purge << dendl;
866
867 if (!open) {
868 if (inos_to_purge.size()){
869 ceph_assert(ls);
870 session->info.prealloc_inos.subtract(inos_to_purge);
871 ls->purging_inodes.insert(inos_to_purge);
872 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
873 mdcache->purge_inodes(inos_to_purge, ls);
874 }
875
876 if (inos_to_free.size()) {
877 ceph_assert(piv);
878 ceph_assert(session->is_closing() || session->is_killing() ||
879 session->is_opening()); // re-open closing session
880 session->info.prealloc_inos.subtract(inos_to_free);
881 mds->inotable->apply_release_ids(inos_to_free);
882 ceph_assert(mds->inotable->get_version() == piv);
883 }
884 session->free_prealloc_inos = session->info.prealloc_inos;
885 session->delegated_inos.clear();
886 }
887
888 mds->sessionmap.mark_dirty(session);
889
890 // apply
891 if (session->get_state_seq() != state_seq) {
892 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
893 << ", noop" << dendl;
894 // close must have been canceled (by an import?), or any number of other things..
895 } else if (open) {
896 ceph_assert(session->is_opening());
897 mds->sessionmap.set_state(session, Session::STATE_OPEN);
898 mds->sessionmap.touch_session(session);
899 metrics_handler->add_session(session);
900 ceph_assert(session->get_connection());
901 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
902 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
903 reply->supported_features = supported_features;
904 reply->metric_spec = supported_metric_spec;
905 }
906 mds->send_message_client(reply, session);
907 if (mdcache->is_readonly()) {
908 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
909 mds->send_message_client(m, session);
910 }
911 } else if (session->is_closing() ||
912 session->is_killing()) {
913 // kill any lingering capabilities, leases, requests
914 bool killing = session->is_killing();
915 while (!session->caps.empty()) {
916 Capability *cap = session->caps.front();
917 CInode *in = cap->get_inode();
918 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
919 mds->locker->remove_client_cap(in, cap, killing);
920 }
921 while (!session->leases.empty()) {
922 ClientLease *r = session->leases.front();
923 CDentry *dn = static_cast<CDentry*>(r->parent);
924 dout(20) << " killing client lease of " << *dn << dendl;
925 dn->remove_client_lease(r, mds->locker);
926 }
927 if (client_reconnect_gather.erase(session->info.get_client())) {
928 dout(20) << " removing client from reconnect set" << dendl;
929 if (client_reconnect_gather.empty()) {
930 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
931 reconnect_gather_finish();
932 }
933 }
934 if (client_reclaim_gather.erase(session->info.get_client())) {
935 dout(20) << " removing client from reclaim set" << dendl;
936 if (client_reclaim_gather.empty()) {
937 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
938 mds->maybe_clientreplay_done();
939 }
940 }
941
942 if (session->is_closing()) {
943 // mark con disposable. if there is a fault, we will get a
944 // reset and clean it up. if the client hasn't received the
945 // CLOSE message yet, they will reconnect and get an
946 // ms_handle_remote_reset() and realize they had in fact closed.
947 // do this *before* sending the message to avoid a possible
948 // race.
949 if (session->get_connection()) {
950       // Conditional because terminate_sessions will indiscriminately
951 // put sessions in CLOSING whether they ever had a conn or not.
952 session->get_connection()->mark_disposable();
953 }
954
955 // reset session
956 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
957 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
958 session->clear();
959 metrics_handler->remove_session(session);
960 mds->sessionmap.remove_session(session);
961 } else if (session->is_killing()) {
962 // destroy session, close connection
963 if (session->get_connection()) {
964 session->get_connection()->mark_down();
965 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
966 session->set_connection(nullptr);
967 }
968 metrics_handler->remove_session(session);
969 mds->sessionmap.remove_session(session);
970 } else {
971 ceph_abort();
972 }
973 } else {
974 ceph_abort();
975 }
976 }
977
978 /**
979 * Inject sessions from some source other than actual connections.
980 *
981 * For example:
982 * - sessions inferred from journal replay
983 * - sessions learned from other MDSs during rejoin
984 * - sessions learned from other MDSs during dir/caps migration
985 * - sessions learned from other MDSs during a cross-MDS rename
986 */
987 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
988 map<client_t,client_metadata_t>& cmm,
989 map<client_t, pair<Session*,uint64_t> >& smap)
990 {
991 version_t pv = mds->sessionmap.get_projected();
992
993 dout(10) << "prepare_force_open_sessions " << pv
994 << " on " << cm.size() << " clients"
995 << dendl;
996
997 mds->objecter->with_osdmap(
998 [this, &cm, &cmm](const OSDMap &osd_map) {
999 for (auto p = cm.begin(); p != cm.end(); ) {
1000 if (osd_map.is_blocklisted(p->second.addr)) {
1001 dout(10) << " ignoring blocklisted client." << p->first
1002 << " (" << p->second.addr << ")" << dendl;
1003 cmm.erase(p->first);
1004 cm.erase(p++);
1005 } else {
1006 ++p;
1007 }
1008 }
1009 });
1010
1011 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
1012 Session *session = mds->sessionmap.get_or_add_session(p->second);
1013 pv = mds->sessionmap.mark_projected(session);
1014 uint64_t sseq;
1015 if (session->is_closed() ||
1016 session->is_closing() ||
1017 session->is_killing()) {
1018 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
1019 auto q = cmm.find(p->first);
1020 if (q != cmm.end())
1021 session->info.client_metadata.merge(q->second);
1022 } else {
1023 ceph_assert(session->is_open() ||
1024 session->is_opening() ||
1025 session->is_stale());
1026 sseq = 0;
1027 }
1028 smap[p->first] = make_pair(session, sseq);
1029 session->inc_importing();
1030 }
1031 return pv;
1032 }
1033
1034 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
1035 bool dec_import)
1036 {
1037 /*
1038 * FIXME: need to carefully consider the race conditions between a
1039 * client trying to close a session and an MDS doing an import
1040 * trying to force open a session...
1041 */
1042 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
1043 << " initial v " << mds->sessionmap.get_version() << dendl;
1044
1045 for (auto &it : smap) {
1046 Session *session = it.second.first;
1047 uint64_t sseq = it.second.second;
1048 if (sseq > 0) {
1049 if (session->get_state_seq() != sseq) {
1050 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
1051 } else {
1052 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
1053 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1054 mds->sessionmap.touch_session(session);
1055 metrics_handler->add_session(session);
1056
1057 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1058 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1059 reply->supported_features = supported_features;
1060 reply->metric_spec = supported_metric_spec;
1061 }
1062 mds->send_message_client(reply, session);
1063
1064 if (mdcache->is_readonly())
1065 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1066 }
1067 } else {
1068 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
1069 ceph_assert(session->is_open() || session->is_stale());
1070 }
1071
1072 if (dec_import) {
1073 session->dec_importing();
1074 }
1075
1076 mds->sessionmap.mark_dirty(session);
1077 }
1078
1079 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1080 }
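// Illustrative caller sketch (hypothetical, heavily simplified from the
// cross-MDS import path): the importer force-opens sessions for the clients
// whose caps it is about to adopt, journals the projected sessionmap version,
// and only "finishes" them once the import is committed, e.g. roughly:
//
//   map<client_t, pair<Session*, uint64_t>> smap;
//   version_t pv = server->prepare_force_open_sessions(client_map, client_metadata_map, smap);
//   // ... journal the import event carrying pv and wait for it to be safe ...
//   server->finish_force_open_sessions(smap, true /* dec_import */);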
1081
1082 class C_MDS_TerminatedSessions : public ServerContext {
1083 void finish(int r) override {
1084 server->terminating_sessions = false;
1085 }
1086 public:
1087 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
1088 };
1089
1090 void Server::terminate_sessions()
1091 {
1092 dout(5) << "terminating all sessions..." << dendl;
1093
1094 terminating_sessions = true;
1095
1096 // kill them off. clients will retry etc.
1097 set<Session*> sessions;
1098 mds->sessionmap.get_client_session_set(sessions);
1099 for (set<Session*>::const_iterator p = sessions.begin();
1100 p != sessions.end();
1101 ++p) {
1102 Session *session = *p;
1103 if (session->is_closing() ||
1104 session->is_killing() ||
1105 session->is_closed())
1106 continue;
1107 journal_close_session(session, Session::STATE_CLOSING, NULL);
1108 }
1109
1110 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
1111 }
1112
1113
1114 void Server::find_idle_sessions()
1115 {
1116 auto now = clock::now();
1117 auto last_cleared_laggy = mds->last_cleared_laggy();
1118
1119 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1120
1121 // timeout/stale
1122 // (caps go stale, lease die)
1123 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1124 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1125
1126 // don't kick clients if we've been laggy
1127 if (last_cleared_laggy < cutoff) {
1128 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1129 << "), not marking any client stale" << dendl;
1130 return;
1131 }
1132
1133 std::vector<Session*> to_evict;
1134
1135 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1136 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1137 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1138 std::vector<Session*> new_stale;
1139
1140 for (auto session : *(sessions_p1->second)) {
1141 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1142 if (last_cap_renew_span < cutoff) {
1143 dout(20) << "laggiest active session is " << session->info.inst
1144 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1145 break;
1146 }
1147
1148 if (session->last_seen > session->last_cap_renew) {
1149 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1150 if (last_cap_renew_span < cutoff) {
1151 dout(20) << "laggiest active session is " << session->info.inst
1152 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1153 continue;
1154 }
1155 }
1156
1157 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1158 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1159 "has arrived" << dendl;
1160 // evict session without marking it stale
1161 to_evict.push_back(session);
1162 continue;
1163 }
1164
1165 if (defer_session_stale &&
1166 !session->is_any_flush_waiter() &&
1167 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1168 dout(20) << "deferring marking session " << session->info.inst << " stale "
1169 "since it holds no caps" << dendl;
1170 continue;
1171 }
1172
1173 auto it = session->info.client_metadata.find("timeout");
1174 if (it != session->info.client_metadata.end()) {
1175 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1176 if (timeout == 0) {
1177 dout(10) << "skipping session " << session->info.inst
1178 << ", infinite timeout specified" << dendl;
1179 continue;
1180 }
1181 double cutoff = queue_max_age + timeout;
1182 if (last_cap_renew_span < cutoff) {
1183 dout(10) << "skipping session " << session->info.inst
1184 << ", timeout (" << timeout << ") specified"
1185 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1186 continue;
1187 }
1188
1189 // do not go through stale, evict it directly.
1190 to_evict.push_back(session);
1191 } else {
1192 dout(10) << "new stale session " << session->info.inst
1193 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1194 new_stale.push_back(session);
1195 }
1196 }
1197
1198 for (auto session : new_stale) {
1199 mds->sessionmap.set_state(session, Session::STATE_STALE);
1200 if (mds->locker->revoke_stale_caps(session)) {
1201 mds->locker->remove_stale_leases(session);
1202 finish_flush_session(session, session->get_push_seq());
1203 auto m = make_message<MClientSession>(CEPH_SESSION_STALE);
1204 mds->send_message_client(m, session);
1205 } else {
1206 to_evict.push_back(session);
1207 }
1208 }
1209 }
1210
1211 // autoclose
1212 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1213
1214 // Collect a list of sessions exceeding the autoclose threshold
1215 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1216 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1217 for (auto session : *(sessions_p2->second)) {
1218 ceph_assert(session->is_stale());
1219 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1220 if (last_cap_renew_span < cutoff) {
1221 dout(20) << "oldest stale session is " << session->info.inst
1222 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1223 break;
1224 }
1225 to_evict.push_back(session);
1226 }
1227 }
1228
1229 for (auto session: to_evict) {
1230 if (session->is_importing()) {
1231 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1232 continue;
1233 }
1234
1235 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1236 mds->clog->warn() << "evicting unresponsive client " << *session
1237 << ", after " << last_cap_renew_span << " seconds";
1238 dout(10) << "autoclosing stale session " << session->info.inst
1239 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1240
1241 if (g_conf()->mds_session_blocklist_on_timeout) {
1242 CachedStackStringStream css;
1243 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
1244 } else {
1245 kill_session(session, NULL);
1246 }
1247 }
1248 }
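// (Illustrative operator knobs for the logic above, not part of the original
//  source: the stale/autoclose windows come from the MDSMap and the blocklist
//  behaviour from MDS config, e.g. hypothetically
//    ceph fs set <fs_name> session_timeout 60
//    ceph fs set <fs_name> session_autoclose 300
//    ceph config set mds mds_session_blocklist_on_timeout false
//  would change what this pass considers idle and how it evicts.)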
1249
1250 void Server::evict_cap_revoke_non_responders() {
1251 if (!cap_revoke_eviction_timeout) {
1252 return;
1253 }
1254
1255 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1256
1257 for (auto const &client: to_evict) {
1258 mds->clog->warn() << "client id " << client << " has not responded to"
1259 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1260 << " seconds, evicting";
1261 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1262 << client << dendl;
1263
1264 CachedStackStringStream css;
1265 bool evicted = mds->evict_client(client.v, false,
1266 g_conf()->mds_session_blocklist_on_evict,
1267 *css, nullptr);
1268 if (evicted && logger) {
1269 logger->inc(l_mdss_cap_revoke_eviction);
1270 }
1271 }
1272 }
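// (Illustrative: this eviction pass is disabled while the timeout is 0; an
//  operator could enable it with, e.g.,
//    ceph config set mds mds_cap_revoke_eviction_timeout 300
//  after which clients sitting on revoked caps for ~300s are evicted and the
//  cap_revoke_eviction perf counter is bumped.)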
1273
1274 void Server::handle_conf_change(const std::set<std::string>& changed) {
1275 if (changed.count("mds_forward_all_requests_to_auth")){
1276 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
1277 }
1278 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1279 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1280 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1281 << cap_revoke_eviction_timeout << dendl;
1282 }
1283 if (changed.count("mds_recall_max_decay_rate")) {
1284 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1285 }
1286 if (changed.count("mds_max_snaps_per_dir")) {
1287 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1288 dout(20) << __func__ << " max snapshots per directory changed to "
1289 << max_snaps_per_dir << dendl;
1290 }
1291 if (changed.count("mds_client_delegate_inos_pct")) {
1292 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1293 }
1294 if (changed.count("mds_max_caps_per_client")) {
1295 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1296 }
1297 if (changed.count("mds_session_cap_acquisition_throttle")) {
1298 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
1299 }
1300 if (changed.count("mds_session_max_caps_throttle_ratio")) {
1301 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
1302 }
1303 if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1304 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1305 }
1306 if (changed.count("mds_alternate_name_max")) {
1307 alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
1308 }
1309 if (changed.count("mds_fscrypt_last_block_max_size")) {
1310 fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size");
1311 }
1312 if (changed.count("mds_dir_max_entries")) {
1313 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
1314 dout(20) << __func__ << " max entries per directory changed to "
1315 << dir_max_entries << dendl;
1316 }
1317 if (changed.count("mds_bal_fragment_size_max")) {
1318 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
1319 dout(20) << __func__ << " max fragment size changed to "
1320 << bal_fragment_size_max << dendl;
1321 }
1322 if (changed.count("mds_inject_rename_corrupt_dentry_first")) {
1323 inject_rename_corrupt_dentry_first = g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first");
1324 }
1325 }
1326
1327 /*
1328 * XXX bump in the interface here, not using an MDSContext here
1329 * because all the callers right now happen to use a SaferCond
1330 */
1331 void Server::kill_session(Session *session, Context *on_safe)
1332 {
1333 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1334
1335 if ((session->is_opening() ||
1336 session->is_open() ||
1337 session->is_stale()) &&
1338 !session->is_importing()) {
1339 dout(10) << "kill_session " << session << dendl;
1340 journal_close_session(session, Session::STATE_KILLING, on_safe);
1341 } else {
1342 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1343 if (session->is_closing() ||
1344 session->is_killing()) {
1345 if (on_safe)
1346 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1347 } else {
1348 ceph_assert(session->is_closed() ||
1349 session->is_importing());
1350 if (on_safe)
1351 on_safe->complete(0);
1352 }
1353 }
1354 }
1355
1356 size_t Server::apply_blocklist()
1357 {
1358 std::vector<Session*> victims;
1359 const auto& sessions = mds->sessionmap.get_sessions();
1360 mds->objecter->with_osdmap(
1361 [&](const OSDMap& o) {
1362 for (const auto& p : sessions) {
1363 if (!p.first.is_client()) {
1364 // Do not apply OSDMap blocklist to MDS daemons, we find out
1365 // about their death via MDSMap.
1366 continue;
1367 }
1368 if (o.is_blocklisted(p.second->info.inst.addr)) {
1369 victims.push_back(p.second);
1370 }
1371 }
1372 });
1373
1374 for (const auto& s : victims) {
1375 kill_session(s, nullptr);
1376 }
1377
1378 dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
1379
1380 return victims.size();
1381 }
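// (Illustrative trigger for the above, not part of the original source: when an
//  operator or the MDS itself blocklists a client address, e.g.
//    ceph osd blocklist add 192.168.0.10:0/123456
//  the MDS later observes the new OSDMap epoch and apply_blocklist() kills the
//  matching client sessions; MDS peers are skipped since their liveness is
//  tracked via the MDSMap.)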
1382
1383 void Server::journal_close_session(Session *session, int state, Context *on_safe)
1384 {
1385 dout(10) << __func__ << " : "
1386 << session->info.inst
1387 << " pending_prealloc_inos " << session->pending_prealloc_inos
1388 << " free_prealloc_inos " << session->free_prealloc_inos
1389 << " delegated_inos " << session->delegated_inos << dendl;
1390
1391 uint64_t sseq = mds->sessionmap.set_state(session, state);
1392 version_t pv = mds->sessionmap.mark_projected(session);
1393 version_t piv = 0;
1394
1395 // release alloc and pending-alloc inos for this session
1396 // and wipe out session state, in case the session close aborts for some reason
1397 interval_set<inodeno_t> inos_to_free;
1398 inos_to_free.insert(session->pending_prealloc_inos);
1399 inos_to_free.insert(session->free_prealloc_inos);
1400 if (inos_to_free.size()) {
1401 mds->inotable->project_release_ids(inos_to_free);
1402 piv = mds->inotable->get_projected_version();
1403 } else
1404 piv = 0;
1405
1406 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1407 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1408 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1409 mdlog->start_submit_entry(le, fin);
1410 mdlog->flush();
1411
1412 // clean up requests, too
1413 while(!session->requests.empty()) {
1414 auto mdr = MDRequestRef(*session->requests.begin());
1415 mdcache->request_kill(mdr);
1416 }
1417
1418 finish_flush_session(session, session->get_push_seq());
1419 }
1420
1421 void Server::reconnect_clients(MDSContext *reconnect_done_)
1422 {
1423 reconnect_done = reconnect_done_;
1424
1425 auto now = clock::now();
1426 set<Session*> sessions;
1427 mds->sessionmap.get_client_session_set(sessions);
1428 for (auto session : sessions) {
1429 if (session->is_open()) {
1430 client_reconnect_gather.insert(session->get_client());
1431 session->set_reconnecting(true);
1432 session->last_cap_renew = now;
1433 }
1434 }
1435
1436 if (client_reconnect_gather.empty()) {
1437 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1438 reconnect_gather_finish();
1439 return;
1440 }
1441
1442 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1443
1444 reconnect_start = now;
1445 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1446 mds->sessionmap.dump();
1447 }
1448
1449 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1450 {
1451 dout(7) << "handle_client_reconnect " << m->get_source()
1452 << (m->has_more() ? " (more)" : "") << dendl;
1453 client_t from = m->get_source().num();
1454 Session *session = mds->get_session(m);
1455 if (!session) {
1456 dout(0) << " ignoring sessionless msg " << *m << dendl;
1457 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1458 reply->metadata["error_string"] = "sessionless";
1459 mds->send_message(reply, m->get_connection());
1460 return;
1461 }
1462
1463 if(mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
1464 mds->clog->warn() << "client could not reconnect as"
1465 " file system flag refuse_client_session is set";
1466 dout(0) << "client cannot reconnect when file system flag"
1467 " refuse_client_session is set" << dendl;
1468 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1469 reply->metadata["error_string"] = "client cannot reconnect when file system flag"
1470 " refuse_client_session is set";
1471 mds->send_message(reply, m->get_connection());
1472 return;
1473 }
1474
1475 if (!session->is_open()) {
1476 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1477 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1478 mds->send_message(reply, m->get_connection());
1479 return;
1480 }
1481
1482 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1483
1484 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1485 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1486 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1487 return;
1488 }
1489
1490 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1491 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1492
1493 bool deny = false;
1494 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1495 // XXX maybe in the future we can do better than this?
1496 if (reconnect_all_deny) {
1497 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1498 } else {
1499 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1500 }
1501 mds->clog->info() << "denied reconnect attempt (mds is "
1502 << ceph_mds_state_name(mds->get_state())
1503 << ") from " << m->get_source_inst()
1504 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1505 deny = true;
1506 } else {
1507 std::string error_str;
1508 if (!session->is_open()) {
1509 error_str = "session is closed";
1510 } else if (mdcache->is_readonly()) {
1511 error_str = "mds is readonly";
1512 } else {
1513 if (session->info.client_metadata.features.empty())
1514 infer_supported_features(session, session->info.client_metadata);
1515
1516 feature_bitset_t missing_features = required_client_features;
1517 missing_features -= session->info.client_metadata.features;
1518 if (!missing_features.empty()) {
1519 CachedStackStringStream css;
1520 *css << "missing required features '" << missing_features << "'";
1521 error_str = css->strv();
1522 }
1523 }
1524
1525 if (!error_str.empty()) {
1526 deny = true;
1527 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1528 mds->clog->info() << "denied reconnect attempt from "
1529 << m->get_source_inst() << " (" << error_str << ")";
1530 }
1531 }
1532
1533 if (deny) {
1534 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1535 mds->send_message_client(r, session);
1536 if (session->is_open()) {
1537 client_reconnect_denied.insert(session->get_client());
1538 }
1539 return;
1540 }
1541
1542 if (!m->has_more()) {
1543 metrics_handler->add_session(session);
1544 // notify client of success with an OPEN
1545 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1546 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1547 reply->supported_features = supported_features;
1548 reply->metric_spec = supported_metric_spec;
1549 }
1550 mds->send_message_client(reply, session);
1551 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1552 }
1553
1554 session->last_cap_renew = clock::now();
1555
1556 // snaprealms
1557 for (const auto &r : m->realms) {
1558 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1559 if (in && in->state_test(CInode::STATE_PURGING))
1560 continue;
1561 if (in) {
1562 if (in->snaprealm) {
1563 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1564 } else {
1565 // this can happen if we are non-auth or we rollback snaprealm
1566 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1567 }
1568 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1569 } else {
1570 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1571 << " seq " << r.realm.seq << dendl;
1572 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1573 }
1574 }
1575
1576 // caps
1577 for (const auto &p : m->caps) {
1578 // make sure our last_cap_id is MAX over all issued caps
1579 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1580 mdcache->last_cap_id = p.second.capinfo.cap_id;
1581
1582 CInode *in = mdcache->get_inode(p.first);
1583 if (in && in->state_test(CInode::STATE_PURGING))
1584 continue;
1585 if (in && in->is_auth()) {
1586 // we recovered it, and it's ours. take note.
1587 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1588 << " on " << *in << dendl;
1589 in->reconnect_cap(from, p.second, session);
1590 mdcache->add_reconnected_cap(from, p.first, p.second);
1591 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1592 continue;
1593 }
1594
1595 if (in && !in->is_auth()) {
1596 // not mine.
1597 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1598 // add to cap export list.
1599 mdcache->rejoin_export_caps(p.first, from, p.second,
1600 in->authority().first, true);
1601 } else {
1602 // don't know if the inode is mine
1603 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1604 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1605 }
1606 }
1607
1608 reconnect_last_seen = clock::now();
1609
1610 if (!m->has_more()) {
1611 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1612
1613 // remove from gather set
1614 client_reconnect_gather.erase(from);
1615 session->set_reconnecting(false);
1616 if (client_reconnect_gather.empty())
1617 reconnect_gather_finish();
1618 }
1619 }
1620
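/**
 * The reconnecting client did not report a feature bitset; infer one from its
 * reported ceph_version / kernel_version metadata and the connection's feature bits.
 */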
1621 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1622 {
1623 int supported = -1;
1624 auto it = client_metadata.find("ceph_version");
1625 if (it != client_metadata.end()) {
1626 // user space client
1627 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1628 supported = CEPHFS_FEATURE_LUMINOUS;
1629 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1630 supported = CEPHFS_FEATURE_KRAKEN;
1631 } else {
1632 it = client_metadata.find("kernel_version");
1633 if (it != client_metadata.end()) {
1634 // kernel client
1635 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1636 supported = CEPHFS_FEATURE_LUMINOUS;
1637 }
1638 }
1639 if (supported == -1 &&
1640 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1641 supported = CEPHFS_FEATURE_JEWEL;
1642
1643 if (supported >= 0) {
1644 unsigned long value = (1UL << (supported + 1)) - 1;
1645 client_metadata.features = feature_bitset_t(value);
1646 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1647 }
1648 }
1649
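/**
 * Refresh the required client feature set from the MDSMap; if we are at least
 * in reconnect state, evict any client session (that is not already
 * blocklisted) missing a required feature.
 */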
1650 void Server::update_required_client_features()
1651 {
1652 required_client_features = mds->mdsmap->get_required_client_features();
1653 dout(7) << "required_client_features: " << required_client_features << dendl;
1654
1655 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1656 set<Session*> sessions;
1657 mds->sessionmap.get_client_session_set(sessions);
1658 for (auto session : sessions) {
1659 feature_bitset_t missing_features = required_client_features;
1660 missing_features -= session->info.client_metadata.features;
1661 if (!missing_features.empty()) {
1662 bool blocklisted = mds->objecter->with_osdmap(
1663 [session](const OSDMap &osd_map) -> bool {
1664 return osd_map.is_blocklisted(session->info.inst.addr);
1665 });
1666 if (blocklisted)
1667 continue;
1668
1669 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1670 << missing_features << "'";
1671 CachedStackStringStream css;
1672 mds->evict_client(session->get_client().v, false,
1673 g_conf()->mds_session_blocklist_on_evict, *css);
1674 }
1675 }
1676 }
1677 }
1678
1679 void Server::reconnect_gather_finish()
1680 {
1681 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1682 ceph_assert(reconnect_done);
1683
1684 if (!mds->snapclient->is_synced()) {
1685 // make sure snaptable cache is populated. snaprealms will be
1686 // extensively used in rejoin stage.
1687 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1688 mds->snapclient->wait_for_sync(reconnect_done);
1689 } else {
1690 reconnect_done->complete(0);
1691 }
1692 reconnect_done = NULL;
1693 }
1694
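/**
 * Called periodically while in the reconnect phase. Once the reconnect timeout
 * expires (or every remaining client has already been denied), give up on
 * clients that have not reconnected: keep sessions that declare a "timeout"
 * for later reclaim, evict the rest, then finish the reconnect gather.
 */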
1695 void Server::reconnect_tick()
1696 {
1697 bool reject_all_reconnect = false;
1698 if (reconnect_evicting) {
1699 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1700 return;
1701 }
1702
1703 /*
1704 * Set mds_deny_all_reconnect to reject all reconnect requests, so that
1705 * less metadata is loaded in the rejoin phase. This shortens reboot time.
1706 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
1707 *
1708 * Why not shorten the reconnect period instead?
1709 * Clients may re-send unsafe or retried requests, which had not been
1710 * completed before the old mds stopped, to the new mds. These requests may
1711 * need to be processed during the new mds's clientreplay phase,
1712 * see: https://github.com/ceph/ceph/pull/29059.
1713 */
1714 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1715 if (client_reconnect_gather.empty())
1716 return;
1717
1718 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1719 reject_all_reconnect = true;
1720
1721 auto now = clock::now();
1722 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1723 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
1724 return;
1725
1726 vector<Session*> remaining_sessions;
1727 remaining_sessions.reserve(client_reconnect_gather.size());
1728 for (auto c : client_reconnect_gather) {
1729 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1730 ceph_assert(session);
1731 remaining_sessions.push_back(session);
1732 // client re-sends cap flush messages before the reconnect message
1733 if (session->last_seen > reconnect_last_seen)
1734 reconnect_last_seen = session->last_seen;
1735 }
1736
1737 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1738 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1739 dout(7) << "reconnect_tick: last seen " << elapse2
1740 << " seconds ago, extending reconnect interval" << dendl;
1741 return;
1742 }
1743
1744 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1745 << " clients have not reconnected in time" << dendl;
1746
1747 // If we're doing blocklist evictions, use this to wait for them before
1748 // proceeding to reconnect_gather_finish
1749 MDSGatherBuilder gather(g_ceph_context);
1750
1751 for (auto session : remaining_sessions) {
1752 // Keep sessions that have specified timeout. These sessions will prevent
1753 // mds from going to active. MDS goes to active after they all have been
1754 // killed or reclaimed.
1755 if (session->info.client_metadata.find("timeout") !=
1756 session->info.client_metadata.end()) {
1757 dout(1) << "reconnect keeps " << session->info.inst
1758 << ", need to be reclaimed" << dendl;
1759 client_reclaim_gather.insert(session->get_client());
1760 continue;
1761 }
1762
1763 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1764
1765 mds->clog->warn() << "evicting unresponsive client " << *session
1766 << ", after waiting " << elapse1
1767 << " seconds during MDS startup";
1768
1769 // make _session_logged() purge orphan objects of lost async/unsafe requests
1770 session->delegated_inos.swap(session->free_prealloc_inos);
1771
1772 if (g_conf()->mds_session_blocklist_on_timeout) {
1773 CachedStackStringStream css;
1774 mds->evict_client(session->get_client().v, false, true, *css,
1775 gather.new_sub());
1776 } else {
1777 kill_session(session, NULL);
1778 }
1779
1780 failed_reconnects++;
1781 }
1782 client_reconnect_gather.clear();
1783 client_reconnect_denied.clear();
1784
1785 if (gather.has_subs()) {
1786 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1787 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1788 [this](int r){reconnect_gather_finish();})));
1789 gather.activate();
1790 reconnect_evicting = true;
1791 } else {
1792 reconnect_gather_finish();
1793 }
1794 }
1795
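/**
 * Re-install file locks reported by a reconnecting client. The bufferlist
 * encodes the fcntl (POSIX) locks followed by the flock locks.
 */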
1796 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1797 {
1798 if (!locks.length()) return;
1799 int numlocks;
1800 ceph_filelock lock;
1801 auto p = locks.cbegin();
1802 decode(numlocks, p);
1803 for (int i = 0; i < numlocks; ++i) {
1804 decode(lock, p);
1805 lock.client = client;
1806 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1807 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1808 }
1809 decode(numlocks, p);
1810 for (int i = 0; i < numlocks; ++i) {
1811 decode(lock, p);
1812 lock.client = client;
1813 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1814 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1815 }
1816 }
1817
1818 /**
1819 * Call this when the MDCache is oversized, to send requests to the clients
1820 * to trim some caps, and consequently unpin some inodes in the MDCache so
1821 * that it can trim too.
1822 */
1823 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1824 {
1825 const auto now = clock::now();
1826 const bool steady = !!(flags&RecallFlags::STEADY);
1827 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1828 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1829 const bool trim = !!(flags&RecallFlags::TRIM);
1830
1831 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1832 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1833 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1834 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1835 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1836 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1837
1838 dout(7) << __func__ << ":"
1839 << " min=" << min_caps_per_client
1840 << " max=" << max_caps_per_client
1841 << " total=" << Capability::count()
1842 << " flags=" << flags
1843 << dendl;
1844
1845 /* trim caps of sessions with the most caps first */
1846 std::multimap<uint64_t, Session*> caps_session;
1847 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1848 auto num_caps = s->caps.size();
1849 auto cache_liveness = s->get_session_cache_liveness();
1850 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1851 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1852 }
1853 };
1854 mds->sessionmap.get_client_sessions(std::move(f));
1855
1856 std::pair<bool, uint64_t> result = {false, 0};
1857 auto& [throttled, caps_recalled] = result;
1858 last_recall_state = now;
1859 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1860 if (!session->is_open() ||
1861 !session->get_connection() ||
1862 !session->info.inst.name.is_client())
1863 continue;
1864
1865 dout(10) << __func__ << ":"
1866 << " session " << session->info.inst
1867 << " caps " << num_caps
1868 << ", leases " << session->leases.size()
1869 << dendl;
1870
1871 uint64_t newlim;
1872 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1873 newlim = min_caps_per_client;
1874 } else {
1875 newlim = num_caps-recall_max_caps;
1876 }
1877 if (num_caps > newlim) {
1878 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1879 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1880 newlim = num_caps-recall;
1881 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1882 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1883 const uint64_t global_recall_throttle = recall_throttle.get();
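// three decay counters gate how fast we recall: a per-session counter,
// a per-session second-order counter, and a global counter shared by all sessions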
1884 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1885 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1886 throttled = true;
1887 continue;
1888 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1889 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1890 throttled = true;
1891 continue;
1892 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1893 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1894 throttled = true;
1895 break;
1896 }
1897
1898 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1899 if (steady) {
1900 const auto session_recall = session->get_recall_caps();
1901 const auto session_release = session->get_release_caps();
1902 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1903 /* The session has been unable to keep up with the number of caps
1904 * recalled (by half); additionally, to prevent marking sessions
1905 * we've just begun to recall from, the session_recall counter
1906 * (decayed count of caps recently recalled) is **greater** than the
1907 * session threshold for the session's cap recall throttle.
1908 */
1909 dout(15) << " 2*session_release < session_recall"
1910 " (2*" << session_release << " < " << session_recall << ") &&"
1911 " 2*session_recall < recall_max_decay_threshold"
1912 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1913 " Skipping because we are unlikely to get more released." << dendl;
1914 continue;
1915 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1916 /* The number of caps to recall is less than the number we *could*
1917 * recall (so there isn't much left to recall?) and twice the number of
1918 * caps to recall is still less than the current recall_caps counter
1919 * (decayed count of caps recently recalled).
1920 */
1921 dout(15) << " 2*recall < session_recall "
1922 " (2*" << recall << " < " << session_recall << ") &&"
1923 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1924 " Skipping because we are unlikely to get more released." << dendl;
1925 continue;
1926 }
1927 }
1928
1929 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1930
1931 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1932 m->head.max_caps = newlim;
1933 mds->send_message_client(m, session);
1934 if (gather) {
1935 flush_session(session, *gather);
1936 }
1937 caps_recalled += session->notify_recall_sent(newlim);
1938 recall_throttle.hit(recall);
1939 }
1940 }
1941
1942 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1943
1944 return result;
1945 }
1946
1947 void Server::force_clients_readonly()
1948 {
1949 dout(10) << "force_clients_readonly" << dendl;
1950 set<Session*> sessions;
1951 mds->sessionmap.get_client_session_set(sessions);
1952 for (set<Session*>::const_iterator p = sessions.begin();
1953 p != sessions.end();
1954 ++p) {
1955 Session *session = *p;
1956 if (!session->info.inst.name.is_client() ||
1957 !(session->is_open() || session->is_stale()))
1958 continue;
1959 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1960 }
1961 }
1962
1963 /*******
1964 * some generic stuff for finishing off requests
1965 */
1966 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1967 {
1968 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1969 ceph_assert(!mdr->has_completed);
1970
1971 // note trace items for eventual reply.
1972 mdr->tracei = in;
1973 if (in)
1974 mdr->pin(in);
1975
1976 mdr->tracedn = dn;
1977 if (dn)
1978 mdr->pin(dn);
1979
1980 early_reply(mdr, in, dn);
1981
1982 mdr->committing = true;
1983 submit_mdlog_entry(le, fin, mdr, __func__);
1984
1985 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1986 if (mds->queue_one_replay()) {
1987 dout(10) << " queued next replay op" << dendl;
1988 } else {
1989 dout(10) << " journaled last replay op" << dendl;
1990 }
1991 } else if (mdr->did_early_reply) {
1992 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1993 if (dn && dn->is_waiter_for(CDentry::WAIT_UNLINK_FINISH))
1994 mdlog->flush();
1995 } else {
1996 mdlog->flush();
1997 }
1998 }
1999
2000 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
2001 std::string_view event)
2002 {
2003 if (mdr) {
2004 string event_str("submit entry: ");
2005 event_str += event;
2006 mdr->mark_event(event_str);
2007 }
2008 mdlog->submit_entry(le, fin);
2009 }
2010
2011 /*
2012 * send response built from mdr contents and error code; clean up mdr
2013 */
2014 void Server::respond_to_request(MDRequestRef& mdr, int r)
2015 {
2016 if (mdr->client_request) {
2017 if (mdr->is_batch_head()) {
2018 dout(20) << __func__ << " batch head " << *mdr << dendl;
2019 mdr->release_batch_op()->respond(r);
2020 } else {
2021 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
2022 }
2023 } else if (mdr->internal_op > -1) {
2024 dout(10) << "respond_to_request on internal request " << mdr << dendl;
2025 if (!mdr->internal_op_finish)
2026 ceph_abort_msg("trying to respond to internal op without finisher");
2027 mdr->internal_op_finish->complete(r);
2028 mdcache->request_finish(mdr);
2029 }
2030 }
2031
2032 // statistics mds req op number and latency
2033 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
2034 {
2035 int code = l_mdss_first;
2036 switch(req->get_op()) {
2037 case CEPH_MDS_OP_LOOKUPHASH:
2038 code = l_mdss_req_lookuphash_latency;
2039 break;
2040 case CEPH_MDS_OP_LOOKUPINO:
2041 code = l_mdss_req_lookupino_latency;
2042 break;
2043 case CEPH_MDS_OP_LOOKUPPARENT:
2044 code = l_mdss_req_lookupparent_latency;
2045 break;
2046 case CEPH_MDS_OP_LOOKUPNAME:
2047 code = l_mdss_req_lookupname_latency;
2048 break;
2049 case CEPH_MDS_OP_LOOKUP:
2050 code = l_mdss_req_lookup_latency;
2051 break;
2052 case CEPH_MDS_OP_LOOKUPSNAP:
2053 code = l_mdss_req_lookupsnap_latency;
2054 break;
2055 case CEPH_MDS_OP_GETATTR:
2056 code = l_mdss_req_getattr_latency;
2057 break;
2058 case CEPH_MDS_OP_SETATTR:
2059 code = l_mdss_req_setattr_latency;
2060 break;
2061 case CEPH_MDS_OP_SETLAYOUT:
2062 code = l_mdss_req_setlayout_latency;
2063 break;
2064 case CEPH_MDS_OP_SETDIRLAYOUT:
2065 code = l_mdss_req_setdirlayout_latency;
2066 break;
2067 case CEPH_MDS_OP_GETVXATTR:
2068 code = l_mdss_req_getvxattr_latency;
2069 break;
2070 case CEPH_MDS_OP_SETXATTR:
2071 code = l_mdss_req_setxattr_latency;
2072 break;
2073 case CEPH_MDS_OP_RMXATTR:
2074 code = l_mdss_req_rmxattr_latency;
2075 break;
2076 case CEPH_MDS_OP_READDIR:
2077 code = l_mdss_req_readdir_latency;
2078 break;
2079 case CEPH_MDS_OP_SETFILELOCK:
2080 code = l_mdss_req_setfilelock_latency;
2081 break;
2082 case CEPH_MDS_OP_GETFILELOCK:
2083 code = l_mdss_req_getfilelock_latency;
2084 break;
2085 case CEPH_MDS_OP_CREATE:
2086 code = l_mdss_req_create_latency;
2087 break;
2088 case CEPH_MDS_OP_OPEN:
2089 code = l_mdss_req_open_latency;
2090 break;
2091 case CEPH_MDS_OP_MKNOD:
2092 code = l_mdss_req_mknod_latency;
2093 break;
2094 case CEPH_MDS_OP_LINK:
2095 code = l_mdss_req_link_latency;
2096 break;
2097 case CEPH_MDS_OP_UNLINK:
2098 code = l_mdss_req_unlink_latency;
2099 break;
2100 case CEPH_MDS_OP_RMDIR:
2101 code = l_mdss_req_rmdir_latency;
2102 break;
2103 case CEPH_MDS_OP_RENAME:
2104 code = l_mdss_req_rename_latency;
2105 break;
2106 case CEPH_MDS_OP_MKDIR:
2107 code = l_mdss_req_mkdir_latency;
2108 break;
2109 case CEPH_MDS_OP_SYMLINK:
2110 code = l_mdss_req_symlink_latency;
2111 break;
2112 case CEPH_MDS_OP_LSSNAP:
2113 code = l_mdss_req_lssnap_latency;
2114 break;
2115 case CEPH_MDS_OP_MKSNAP:
2116 code = l_mdss_req_mksnap_latency;
2117 break;
2118 case CEPH_MDS_OP_RMSNAP:
2119 code = l_mdss_req_rmsnap_latency;
2120 break;
2121 case CEPH_MDS_OP_RENAMESNAP:
2122 code = l_mdss_req_renamesnap_latency;
2123 break;
2124 default:
2125 dout(1) << ": unknown client op" << dendl;
2126 return;
2127 }
2128 logger->tinc(code, lat);
2129 }
2130
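/**
 * Send an "unsafe" reply before the update is journaled so the client can
 * proceed early. Not done for replayed requests, requests that allocated an
 * inode, requests with journaled peers, or when mds_early_reply is disabled.
 */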
2131 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2132 {
2133 if (!g_conf()->mds_early_reply)
2134 return;
2135
2136 if (mdr->no_early_reply) {
2137 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2138 return;
2139 }
2140
2141 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2142 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2143 return;
2144 }
2145
2146 if (mdr->alloc_ino) {
2147 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2148 return;
2149 }
2150
2151 const cref_t<MClientRequest> &req = mdr->client_request;
2152 entity_inst_t client_inst = req->get_source_inst();
2153 if (client_inst.name.is_mds())
2154 return;
2155
2156 if (req->is_replay()) {
2157 dout(10) << " no early reply on replay op" << dendl;
2158 return;
2159 }
2160
2161
2162 auto reply = make_message<MClientReply>(*req, 0);
2163 reply->set_unsafe();
2164
2165 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2166 //
2167 // _rename_finish() does not send dentry link/unlink messages to replicas,
2168 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
2169 // that have projected linkages from getting new replicas.
2170 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2171
2172 dout(10) << "early_reply " << reply->get_result()
2173 << " (" << cpp_strerror(reply->get_result())
2174 << ") " << *req << dendl;
2175
2176 if (tracei || tracedn) {
2177 if (tracei)
2178 mdr->cap_releases.erase(tracei->vino());
2179 if (tracedn)
2180 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2181
2182 set_trace_dist(reply, tracei, tracedn, mdr);
2183 }
2184
2185 reply->set_extra_bl(mdr->reply_extra_bl);
2186 mds->send_message_client(reply, mdr->session);
2187
2188 mdr->did_early_reply = true;
2189
2190 mds->logger->inc(l_mds_reply);
2191 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2192 mds->logger->tinc(l_mds_reply_latency, lat);
2193 if (lat >= g_conf()->mds_op_complaint_time) {
2194 mds->logger->inc(l_mds_slow_reply);
2195 }
2196 if (client_inst.name.is_client()) {
2197 mds->sessionmap.hit_session(mdr->session);
2198 }
2199 perf_gather_op_latency(req, lat);
2200 dout(20) << "lat " << lat << dendl;
2201
2202 mdr->mark_event("early_replied");
2203 }
2204
2205 /*
2206 * send given reply
2207 * include a trace to tracei
2208 * Clean up mdr
2209 */
2210 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2211 {
2212 ceph_assert(mdr.get());
2213 const cref_t<MClientRequest> &req = mdr->client_request;
2214
2215 dout(7) << "reply_client_request " << reply->get_result()
2216 << " (" << cpp_strerror(reply->get_result())
2217 << ") " << *req << dendl;
2218
2219 mdr->mark_event("replying");
2220
2221 Session *session = mdr->session;
2222
2223 // note successful request in session map?
2224 //
2225 // setfilelock requests are special: they only modify state in MDS memory.
2226 // That state is lost when the MDS fails. If a client re-sends a completed
2227 // setfilelock request, it means the client did not receive the corresponding
2228 // setfilelock reply, so the MDS should re-execute the setfilelock request.
2229 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2230 reply->get_result() == 0 && session) {
2231 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2232 session->add_completed_request(mdr->reqid.tid, created);
2233 if (mdr->ls) {
2234 mdr->ls->touched_sessions.insert(session->info.inst.name);
2235 }
2236 }
2237
2238 // give any preallocated inos to the session
2239 apply_allocated_inos(mdr, session);
2240
2241 // get tracei/tracedn from mdr?
2242 CInode *tracei = mdr->tracei;
2243 CDentry *tracedn = mdr->tracedn;
2244
2245 bool is_replay = mdr->client_request->is_replay();
2246 bool did_early_reply = mdr->did_early_reply;
2247 entity_inst_t client_inst = req->get_source_inst();
2248
2249 if (!did_early_reply && !is_replay) {
2250
2251 mds->logger->inc(l_mds_reply);
2252 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2253 mds->logger->tinc(l_mds_reply_latency, lat);
2254 if (lat >= g_conf()->mds_op_complaint_time) {
2255 mds->logger->inc(l_mds_slow_reply);
2256 }
2257 if (session && client_inst.name.is_client()) {
2258 mds->sessionmap.hit_session(session);
2259 }
2260 perf_gather_op_latency(req, lat);
2261 dout(20) << "lat " << lat << dendl;
2262
2263 if (tracei)
2264 mdr->cap_releases.erase(tracei->vino());
2265 if (tracedn)
2266 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2267 }
2268
2269 // drop non-rdlocks before replying, so that we can issue leases
2270 mdcache->request_drop_non_rdlocks(mdr);
2271
2272 // reply at all?
2273 if (session && !client_inst.name.is_mds()) {
2274 // send reply.
2275 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2276 (tracei || tracedn)) {
2277 if (is_replay) {
2278 if (tracei)
2279 mdcache->try_reconnect_cap(tracei, session);
2280 } else {
2281 // include metadata in reply
2282 set_trace_dist(reply, tracei, tracedn, mdr);
2283 }
2284 }
2285
2286 // We can set the extra bl unconditionally: if it's already been sent in the
2287 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2288 reply->set_extra_bl(mdr->reply_extra_bl);
2289
2290 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2291 mds->send_message_client(reply, session);
2292 }
2293
2294 if (req->is_queued_for_replay() &&
2295 (mdr->has_completed || reply->get_result() < 0)) {
2296 if (reply->get_result() < 0) {
2297 int r = reply->get_result();
2298 derr << "reply_client_request: failed to replay " << *req
2299 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2300 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2301 }
2302 mds->queue_one_replay();
2303 }
2304
2305 // clean up request
2306 mdcache->request_finish(mdr);
2307
2308 // take a closer look at tracei, if it happens to be a remote link
2309 if (tracei &&
2310 tracedn &&
2311 tracedn->get_projected_linkage()->is_remote()) {
2312 mdcache->eval_remote(tracedn);
2313 }
2314 }
2315
2316 /*
2317 * pass inode OR dentry (not both, or we may get confused)
2318 *
2319 * trace is in reverse order (i.e. root inode comes last)
2320 */
2321 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2322 CInode *in, CDentry *dn,
2323 MDRequestRef& mdr)
2324 {
2325 // skip doing this for debugging purposes?
2326 if (g_conf()->mds_inject_traceless_reply_probability &&
2327 mdr->ls && !mdr->o_trunc &&
2328 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2329 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2330 return;
2331 }
2332
2333 // inode, dentry, dir, ..., inode
2334 bufferlist bl;
2335 mds_rank_t whoami = mds->get_nodeid();
2336 Session *session = mdr->session;
2337 snapid_t snapid = mdr->snapid;
2338 utime_t now = ceph_clock_now();
2339
2340 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2341
2342 // realm
2343 if (snapid == CEPH_NOSNAP) {
2344 SnapRealm *realm;
2345 if (in)
2346 realm = in->find_snaprealm();
2347 else
2348 realm = dn->get_dir()->get_inode()->find_snaprealm();
2349 reply->snapbl = get_snap_trace(session, realm);
2350 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2351 }
2352
2353 // dir + dentry?
2354 if (dn) {
2355 reply->head.is_dentry = 1;
2356 CDir *dir = dn->get_dir();
2357 CInode *diri = dir->get_inode();
2358
2359 diri->encode_inodestat(bl, session, NULL, snapid);
2360 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2361
2362 #ifdef MDS_VERIFY_FRAGSTAT
2363 if (dir->is_complete())
2364 dir->verify_fragstat();
2365 #endif
2366 DirStat ds;
2367 ds.frag = dir->get_frag();
2368 ds.auth = dir->get_dir_auth().first;
2369 if (dir->is_auth() && !forward_all_requests_to_auth)
2370 dir->get_dist_spec(ds.dist, whoami);
2371
2372 dir->encode_dirstat(bl, session->info, ds);
2373 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2374
2375 encode(dn->get_name(), bl);
2376 mds->locker->issue_client_lease(dn, in, mdr, now, bl);
2377 } else
2378 reply->head.is_dentry = 0;
2379
2380 // inode
2381 if (in) {
2382 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2383 dout(20) << "set_trace_dist added in " << *in << dendl;
2384 reply->head.is_target = 1;
2385 } else
2386 reply->head.is_target = 0;
2387
2388 reply->set_trace(bl);
2389 }
2390
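/**
 * Entry point for client requests: check the session, short-circuit requests
 * that have already completed (replay/retry), trim the session's
 * completed-request list, then register an MDRequest, process any embedded
 * cap releases, and dispatch it.
 */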
2391 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2392 {
2393 dout(4) << "handle_client_request " << *req << dendl;
2394
2395 if (mds->logger)
2396 mds->logger->inc(l_mds_request);
2397 if (logger)
2398 logger->inc(l_mdss_handle_client_request);
2399
2400 if (!mdcache->is_open()) {
2401 dout(5) << "waiting for root" << dendl;
2402 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2403 return;
2404 }
2405
2406 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2407 // active session?
2408 Session *session = 0;
2409 if (req->is_a_client()) {
2410 session = mds->get_session(req);
2411 if (!session) {
2412 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2413 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2414 session->is_closing() ||
2415 session->is_killing()) {
2416 dout(5) << "session closed|closing|killing, dropping" << dendl;
2417 session = NULL;
2418 }
2419 if (!session) {
2420 if (req->is_queued_for_replay())
2421 mds->queue_one_replay();
2422 return;
2423 }
2424 }
2425
2426 // old mdsmap?
2427 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2428 // send it? hrm, this isn't ideal; they may get a lot of copies if
2429 // they have a high request rate.
2430 }
2431
2432 // completed request?
2433 bool has_completed = false;
2434 if (req->is_replay() || req->get_retry_attempt()) {
2435 ceph_assert(session);
2436 inodeno_t created;
2437 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2438 has_completed = true;
2439 if (!session->is_open())
2440 return;
2441 // Don't send a traceless reply if the completed request has created a
2442 // new inode. Treat the request as a lookup request instead.
2443 if (req->is_replay() ||
2444 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2445 req->get_op() != CEPH_MDS_OP_OPEN &&
2446 req->get_op() != CEPH_MDS_OP_CREATE)) {
2447 dout(5) << "already completed " << req->get_reqid() << dendl;
2448 auto reply = make_message<MClientReply>(*req, 0);
2449 if (created != inodeno_t()) {
2450 bufferlist extra;
2451 encode(created, extra);
2452 reply->set_extra_bl(extra);
2453 }
2454 mds->send_message_client(reply, session);
2455
2456 if (req->is_queued_for_replay())
2457 mds->queue_one_replay();
2458
2459 return;
2460 }
2461 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2462 req->get_op() != CEPH_MDS_OP_CREATE) {
2463 dout(10) << " completed request which created new inode " << created
2464 << ", convert it to lookup request" << dendl;
2465 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2466 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2467 }
2468 }
2469 }
2470
2471 // trim completed_request list
2472 if (req->get_oldest_client_tid() > 0) {
2473 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2474 ceph_assert(session);
2475 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2476 // The session's 'completed_requests' was dirtied; mark it to be
2477 // potentially flushed at segment expiry.
2478 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2479
2480 if (session->get_num_trim_requests_warnings() > 0 &&
2481 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2482 session->reset_num_trim_requests_warnings();
2483 } else {
2484 if (session->get_num_completed_requests() >=
2485 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2486 session->inc_num_trim_requests_warnings();
2487 CachedStackStringStream css;
2488 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2489 << req->get_oldest_client_tid() << "), "
2490 << session->get_num_completed_requests()
2491 << " completed requests recorded in session\n";
2492 mds->clog->warn() << css->strv();
2493 dout(20) << __func__ << " " << css->strv() << dendl;
2494 }
2495 }
2496 }
2497
2498 // register + dispatch
2499 MDRequestRef mdr = mdcache->request_start(req);
2500 if (!mdr.get())
2501 return;
2502
2503 if (session) {
2504 mdr->session = session;
2505 session->requests.push_back(&mdr->item_session_request);
2506 }
2507
2508 if (has_completed)
2509 mdr->has_completed = true;
2510
2511 // process embedded cap releases?
2512 // (only if NOT replay!)
2513 if (!req->releases.empty() && req->is_a_client() && !req->is_replay()) {
2514 client_t client = req->get_source().num();
2515 for (const auto &r : req->releases) {
2516 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2517 }
2518 req->releases.clear();
2519 }
2520
2521 dispatch_client_request(mdr);
2522 return;
2523 }
2524
2525 void Server::handle_osd_map()
2526 {
2527 /* Note that we check the OSDMAP_FULL flag directly rather than
2528 * using osdmap_full_flag(), because we want to know "is the flag set"
2529 * rather than "does the flag apply to us?" */
2530 mds->objecter->with_osdmap([this](const OSDMap& o) {
2531 auto pi = o.get_pg_pool(mds->get_metadata_pool());
2532 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2533 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2534 << o.get_epoch() << dendl;
2535 });
2536 }
2537
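/**
 * Route a registered MDRequest to the per-op handler, after handling
 * killed/aborted requests, a read-only filesystem, peer errors, and a full
 * metadata pool.
 */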
2538 void Server::dispatch_client_request(MDRequestRef& mdr)
2539 {
2540 // we shouldn't be waiting on anyone.
2541 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2542
2543 if (mdr->killed) {
2544 dout(10) << "request " << *mdr << " was killed" << dendl;
2545 // if the mdr is a "batch_op" and it has followers, pick a follower as
2546 // the new "head of the batch ops" and go on processing the new one.
2547 if (mdr->is_batch_head()) {
2548 int mask = mdr->client_request->head.args.getattr.mask;
2549 auto it = mdr->batch_op_map->find(mask);
2550 auto new_batch_head = it->second->find_new_head();
2551 if (!new_batch_head) {
2552 mdr->batch_op_map->erase(it);
2553 return;
2554 }
2555 mdr = std::move(new_batch_head);
2556 } else {
2557 return;
2558 }
2559 } else if (mdr->aborted) {
2560 mdr->aborted = false;
2561 mdcache->request_kill(mdr);
2562 return;
2563 }
2564
2565 const cref_t<MClientRequest> &req = mdr->client_request;
2566
2567 if (logger) logger->inc(l_mdss_dispatch_client_request);
2568
2569 dout(7) << "dispatch_client_request " << *req << dendl;
2570
2571 if (req->may_write() && mdcache->is_readonly()) {
2572 dout(10) << " read-only FS" << dendl;
2573 respond_to_request(mdr, -CEPHFS_EROFS);
2574 return;
2575 }
2576 if (mdr->has_more() && mdr->more()->peer_error) {
2577 dout(10) << " got error from peers" << dendl;
2578 respond_to_request(mdr, mdr->more()->peer_error);
2579 return;
2580 }
2581
2582 if (is_full) {
2583 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2584 if (!cur) {
2585 // the request is already responded to
2586 return;
2587 }
2588 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2589 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2590 req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2591 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2592 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2593 req->get_op() == CEPH_MDS_OP_CREATE ||
2594 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2595 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2596 ((req->get_op() == CEPH_MDS_OP_LINK ||
2597 req->get_op() == CEPH_MDS_OP_RENAME) &&
2598 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2599 ) {
2600
2601 if (check_access(mdr, cur, MAY_FULL)) {
2602 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2603 } else {
2604 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2605 respond_to_request(mdr, -CEPHFS_ENOSPC);
2606 return;
2607 }
2608 } else {
2609 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2610 }
2611 }
2612
2613 switch (req->get_op()) {
2614 case CEPH_MDS_OP_LOOKUPHASH:
2615 case CEPH_MDS_OP_LOOKUPINO:
2616 handle_client_lookup_ino(mdr, false, false);
2617 break;
2618 case CEPH_MDS_OP_LOOKUPPARENT:
2619 handle_client_lookup_ino(mdr, true, false);
2620 break;
2621 case CEPH_MDS_OP_LOOKUPNAME:
2622 handle_client_lookup_ino(mdr, false, true);
2623 break;
2624
2625 // inodes ops.
2626 case CEPH_MDS_OP_LOOKUP:
2627 handle_client_getattr(mdr, true);
2628 break;
2629
2630 case CEPH_MDS_OP_LOOKUPSNAP:
2631 // lookupsnap does not reference a CDentry; treat it as a getattr
2632 case CEPH_MDS_OP_GETATTR:
2633 handle_client_getattr(mdr, false);
2634 break;
2635 case CEPH_MDS_OP_GETVXATTR:
2636 handle_client_getvxattr(mdr);
2637 break;
2638
2639 case CEPH_MDS_OP_SETATTR:
2640 handle_client_setattr(mdr);
2641 break;
2642 case CEPH_MDS_OP_SETLAYOUT:
2643 handle_client_setlayout(mdr);
2644 break;
2645 case CEPH_MDS_OP_SETDIRLAYOUT:
2646 handle_client_setdirlayout(mdr);
2647 break;
2648 case CEPH_MDS_OP_SETXATTR:
2649 handle_client_setxattr(mdr);
2650 break;
2651 case CEPH_MDS_OP_RMXATTR:
2652 handle_client_removexattr(mdr);
2653 break;
2654
2655 case CEPH_MDS_OP_READDIR:
2656 handle_client_readdir(mdr);
2657 break;
2658
2659 case CEPH_MDS_OP_SETFILELOCK:
2660 handle_client_file_setlock(mdr);
2661 break;
2662
2663 case CEPH_MDS_OP_GETFILELOCK:
2664 handle_client_file_readlock(mdr);
2665 break;
2666
2667 // funky.
2668 case CEPH_MDS_OP_CREATE:
2669 if (mdr->has_completed)
2670 handle_client_open(mdr); // already created.. just open
2671 else
2672 handle_client_openc(mdr);
2673 break;
2674
2675 case CEPH_MDS_OP_OPEN:
2676 handle_client_open(mdr);
2677 break;
2678
2679 // namespace.
2680 // no prior locks.
2681 case CEPH_MDS_OP_MKNOD:
2682 handle_client_mknod(mdr);
2683 break;
2684 case CEPH_MDS_OP_LINK:
2685 handle_client_link(mdr);
2686 break;
2687 case CEPH_MDS_OP_UNLINK:
2688 case CEPH_MDS_OP_RMDIR:
2689 handle_client_unlink(mdr);
2690 break;
2691 case CEPH_MDS_OP_RENAME:
2692 handle_client_rename(mdr);
2693 break;
2694 case CEPH_MDS_OP_MKDIR:
2695 handle_client_mkdir(mdr);
2696 break;
2697 case CEPH_MDS_OP_SYMLINK:
2698 handle_client_symlink(mdr);
2699 break;
2700
2701
2702 // snaps
2703 case CEPH_MDS_OP_LSSNAP:
2704 handle_client_lssnap(mdr);
2705 break;
2706 case CEPH_MDS_OP_MKSNAP:
2707 handle_client_mksnap(mdr);
2708 break;
2709 case CEPH_MDS_OP_RMSNAP:
2710 handle_client_rmsnap(mdr);
2711 break;
2712 case CEPH_MDS_OP_RENAMESNAP:
2713 handle_client_renamesnap(mdr);
2714 break;
2715
2716 default:
2717 dout(1) << " unknown client op " << req->get_op() << dendl;
2718 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2719 }
2720 }
2721
2722
2723 // ---------------------------------------
2724 // PEER REQUESTS
2725
2726 void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
2727 {
2728 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2729 mds_rank_t from = mds_rank_t(m->get_source().num());
2730
2731 if (logger) logger->inc(l_mdss_handle_peer_request);
2732
2733 // reply?
2734 if (m->is_reply())
2735 return handle_peer_request_reply(m);
2736
2737 // the purpose of rename notify is to enforce causal message ordering, making sure
2738 // bystanders have received all messages from the rename srcdn's auth MDS.
2739 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2740 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
2741 mds->send_message(reply, m->get_connection());
2742 return;
2743 }
2744
2745 CDentry *straydn = NULL;
2746 if (m->straybl.length() > 0) {
2747 mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
2748 ceph_assert(straydn);
2749 m->straybl.clear();
2750 }
2751
2752 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2753 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2754 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2755 return;
2756 }
2757
2758 // am i a new peer?
2759 MDRequestRef mdr;
2760 if (mdcache->have_request(m->get_reqid())) {
2761 // existing?
2762 mdr = mdcache->request_get(m->get_reqid());
2763
2764 // is my request newer?
2765 if (mdr->attempt > m->get_attempt()) {
2766 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2767 << ", dropping " << *m << dendl;
2768 return;
2769 }
2770
2771 if (mdr->attempt < m->get_attempt()) {
2772 // mine is old, close it out
2773 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2774 << ", closing out" << dendl;
2775 mdcache->request_finish(mdr);
2776 mdr.reset();
2777 } else if (mdr->peer_to_mds != from) {
2778 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2779 return;
2780 }
2781
2782 // may get these while mdr->peer_request is non-null
2783 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2784 mds->locker->drop_locks(mdr.get());
2785 return;
2786 }
2787 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2788 if (m->is_abort()) {
2789 mdr->aborted = true;
2790 if (mdr->peer_request) {
2791 // only abort on-going xlock, wrlock and auth pin
2792 ceph_assert(!mdr->peer_did_prepare());
2793 } else {
2794 mdcache->request_finish(mdr);
2795 }
2796 } else {
2797 if (m->inode_export.length() > 0)
2798 mdr->more()->inode_import = m->inode_export;
2799 // finish off request.
2800 mdcache->request_finish(mdr);
2801 }
2802 return;
2803 }
2804 }
2805 if (!mdr.get()) {
2806 // new?
2807 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2808 dout(10) << "missing peer request for " << m->get_reqid()
2809 << " OP_FINISH, must have lost race with a forward" << dendl;
2810 return;
2811 }
2812 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2813 mdr->set_op_stamp(m->op_stamp);
2814 }
2815 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2816
2817 if (straydn) {
2818 mdr->pin(straydn);
2819 mdr->straydn = straydn;
2820 }
2821
2822 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2823 mdr->locks.empty()) {
2824 dout(3) << "not active yet, waiting" << dendl;
2825 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2826 return;
2827 }
2828
2829 mdr->reset_peer_request(m);
2830
2831 dispatch_peer_request(mdr);
2832 }
2833
2834 void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2835 {
2836 mds_rank_t from = mds_rank_t(m->get_source().num());
2837
2838 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2839 metareqid_t r = m->get_reqid();
2840 if (!mdcache->have_uncommitted_leader(r, from)) {
2841 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2842 << from << " reqid " << r << dendl;
2843 return;
2844 }
2845 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2846 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2847 return;
2848 }
2849
2850 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
2851 metareqid_t r = m->get_reqid();
2852 mdcache->committed_leader_peer(r, from);
2853 return;
2854 }
2855
2856 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2857 if (m->get_attempt() != mdr->attempt) {
2858 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
2859 << m->get_attempt() << dendl;
2860 return;
2861 }
2862
2863 switch (m->get_op()) {
2864 case MMDSPeerRequest::OP_XLOCKACK:
2865 {
2866 // identify lock, leader request
2867 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2868 m->get_object_info());
2869 mdr->more()->peers.insert(from);
2870 lock->decode_locked_state(m->get_lock_data());
2871 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2872 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2873 mdr->finish_locking(lock);
2874 lock->get_xlock(mdr, mdr->get_client());
2875
2876 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2877 mdr->more()->waiting_on_peer.erase(from);
2878 ceph_assert(mdr->more()->waiting_on_peer.empty());
2879 mdcache->dispatch_request(mdr);
2880 }
2881 break;
2882
2883 case MMDSPeerRequest::OP_WRLOCKACK:
2884 {
2885 // identify lock, leader request
2886 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2887 m->get_object_info());
2888 mdr->more()->peers.insert(from);
2889 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2890 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2891 ceph_assert(it->is_remote_wrlock());
2892 ceph_assert(it->wrlock_target == from);
2893
2894 mdr->finish_locking(lock);
2895
2896 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2897 mdr->more()->waiting_on_peer.erase(from);
2898 ceph_assert(mdr->more()->waiting_on_peer.empty());
2899 mdcache->dispatch_request(mdr);
2900 }
2901 break;
2902
2903 case MMDSPeerRequest::OP_AUTHPINACK:
2904 handle_peer_auth_pin_ack(mdr, m);
2905 break;
2906
2907 case MMDSPeerRequest::OP_LINKPREPACK:
2908 handle_peer_link_prep_ack(mdr, m);
2909 break;
2910
2911 case MMDSPeerRequest::OP_RMDIRPREPACK:
2912 handle_peer_rmdir_prep_ack(mdr, m);
2913 break;
2914
2915 case MMDSPeerRequest::OP_RENAMEPREPACK:
2916 handle_peer_rename_prep_ack(mdr, m);
2917 break;
2918
2919 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2920 handle_peer_rename_notify_ack(mdr, m);
2921 break;
2922
2923 default:
2924 ceph_abort_msg("unknown op " + to_string(m->get_op()) + " requested");
2925 }
2926 }
2927
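/**
 * Dispatch a peer request from the leader MDS: acquire or release the
 * requested xlock/wrlock and ack, or hand off to the auth-pin / link /
 * rmdir / rename prep handlers.
 */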
2928 void Server::dispatch_peer_request(MDRequestRef& mdr)
2929 {
2930 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2931
2932 if (mdr->aborted) {
2933 dout(7) << " abort flag set, finishing" << dendl;
2934 mdcache->request_finish(mdr);
2935 return;
2936 }
2937
2938 if (logger) logger->inc(l_mdss_dispatch_peer_request);
2939
2940 int op = mdr->peer_request->get_op();
2941 switch (op) {
2942 case MMDSPeerRequest::OP_XLOCK:
2943 case MMDSPeerRequest::OP_WRLOCK:
2944 {
2945 // identify object
2946 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2947 mdr->peer_request->get_object_info());
2948
2949 if (!lock) {
2950 dout(10) << "don't have object, dropping" << dendl;
2951 ceph_abort_msg("don't have object"); // can this happen, if we auth pinned properly.
2952 }
2953 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2954 dout(10) << "not auth for remote xlock attempt, dropping on "
2955 << *lock << " on " << *lock->get_parent() << dendl;
2956 } else {
2957 // use acquire_locks so that we get auth_pinning.
2958 MutationImpl::LockOpVec lov;
2959 for (const auto& p : mdr->locks) {
2960 if (p.is_xlock())
2961 lov.add_xlock(p.lock);
2962 else if (p.is_wrlock())
2963 lov.add_wrlock(p.lock);
2964 }
2965
2966 int replycode = 0;
2967 switch (op) {
2968 case MMDSPeerRequest::OP_XLOCK:
2969 lov.add_xlock(lock);
2970 replycode = MMDSPeerRequest::OP_XLOCKACK;
2971 break;
2972 case MMDSPeerRequest::OP_WRLOCK:
2973 lov.add_wrlock(lock);
2974 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2975 break;
2976 }
2977
2978 if (!mds->locker->acquire_locks(mdr, lov))
2979 return;
2980
2981 // ack
2982 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2983 r->set_lock_type(lock->get_type());
2984 lock->get_parent()->set_object_info(r->get_object_info());
2985 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
2986 lock->encode_locked_state(r->get_lock_data());
2987 mds->send_message(r, mdr->peer_request->get_connection());
2988 }
2989
2990 // done.
2991 mdr->reset_peer_request();
2992 }
2993 break;
2994
2995 case MMDSPeerRequest::OP_UNXLOCK:
2996 case MMDSPeerRequest::OP_UNWRLOCK:
2997 {
2998 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2999 mdr->peer_request->get_object_info());
3000 ceph_assert(lock);
3001 auto it = mdr->locks.find(lock);
3002 ceph_assert(it != mdr->locks.end());
3003 bool need_issue = false;
3004 switch (op) {
3005 case MMDSPeerRequest::OP_UNXLOCK:
3006 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
3007 break;
3008 case MMDSPeerRequest::OP_UNWRLOCK:
3009 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
3010 break;
3011 }
3012 if (need_issue)
3013 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
3014
3015 // done. no ack necessary.
3016 mdr->reset_peer_request();
3017 }
3018 break;
3019
3020 case MMDSPeerRequest::OP_AUTHPIN:
3021 handle_peer_auth_pin(mdr);
3022 break;
3023
3024 case MMDSPeerRequest::OP_LINKPREP:
3025 case MMDSPeerRequest::OP_UNLINKPREP:
3026 handle_peer_link_prep(mdr);
3027 break;
3028
3029 case MMDSPeerRequest::OP_RMDIRPREP:
3030 handle_peer_rmdir_prep(mdr);
3031 break;
3032
3033 case MMDSPeerRequest::OP_RENAMEPREP:
3034 handle_peer_rename_prep(mdr);
3035 break;
3036
3037 default:
3038 ceph_abort_msg("unknown op "+ to_string(op)+ " received");
3039 }
3040 }
3041
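/**
 * Auth-pin the objects named in a peer request on behalf of the leader
 * (optionally freezing one inode for rename), and ack with the list of pins
 * we actually hold, or with an error if read-only or the pin would block.
 */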
3042 void Server::handle_peer_auth_pin(MDRequestRef& mdr)
3043 {
3044 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
3045
3046 // build list of objects
3047 list<MDSCacheObject*> objects;
3048 CInode *auth_pin_freeze = NULL;
3049 bool nonblocking = mdr->peer_request->is_nonblocking();
3050 bool fail = false, wouldblock = false, readonly = false;
3051 ref_t<MMDSPeerRequest> reply;
3052
3053 if (mdcache->is_readonly()) {
3054 dout(10) << " read-only FS" << dendl;
3055 readonly = true;
3056 fail = true;
3057 }
3058
3059 if (!fail) {
3060 for (const auto &oi : mdr->peer_request->get_authpins()) {
3061 MDSCacheObject *object = mdcache->get_object(oi);
3062 if (!object) {
3063 dout(10) << " don't have " << oi << dendl;
3064 fail = true;
3065 break;
3066 }
3067
3068 objects.push_back(object);
3069 if (oi == mdr->peer_request->get_authpin_freeze())
3070 auth_pin_freeze = static_cast<CInode*>(object);
3071 }
3072 }
3073
3074 // can we auth pin them?
3075 if (!fail) {
3076 for (const auto& obj : objects) {
3077 if (!obj->is_auth()) {
3078 dout(10) << " not auth for " << *obj << dendl;
3079 fail = true;
3080 break;
3081 }
3082 if (mdr->is_auth_pinned(obj))
3083 continue;
3084 if (!mdr->can_auth_pin(obj)) {
3085 if (nonblocking) {
3086 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
3087 fail = true;
3088 wouldblock = true;
3089 break;
3090 }
3091 // wait
3092 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3093 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3094 mdr->drop_local_auth_pins();
3095
3096 mds->locker->notify_freeze_waiter(obj);
3097 goto blocked;
3098 }
3099 }
3100 }
3101
3102 if (!fail) {
3103 /* the frozen auth pin is on the wrong inode; unfreeze it */
3104 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3105 mdr->more()->rename_inode != auth_pin_freeze)
3106 mdr->unfreeze_auth_pin(true);
3107
3108 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
3109 * on the source inode to complete. This happens after all locks for the rename
3110 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3111 * parent objects first. So there is an ABBA deadlock if someone auth pins the source
3112 * inode after locks are acquired and before Server::handle_peer_rename_prep() is
3113 * called. The solution is to freeze the inode and prevent other MDRequests from
3114 * getting new auth pins.
3115 */
3116 if (auth_pin_freeze) {
3117 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3118 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3119 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3120 mds->mdlog->flush();
3121 goto blocked;
3122 }
3123 }
3124 }
3125
3126 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3127
3128 if (fail) {
3129 mdr->drop_local_auth_pins(); // just in case
3130 if (readonly)
3131 reply->mark_error_rofs();
3132 if (wouldblock)
3133 reply->mark_error_wouldblock();
3134 } else {
3135 // auth pin!
3136 for (const auto& obj : objects) {
3137 dout(10) << "auth_pinning " << *obj << dendl;
3138 mdr->auth_pin(obj);
3139 }
3140 // return list of my auth_pins (if any)
3141 for (const auto &p : mdr->object_states) {
3142 if (!p.second.auth_pinned)
3143 continue;
3144 MDSCacheObjectInfo info;
3145 p.first->set_object_info(info);
3146 reply->get_authpins().push_back(info);
3147 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3148 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3149 }
3150 }
3151
3152 mds->send_message_mds(reply, mdr->peer_to_mds);
3153
3154 // clean up this request
3155 mdr->reset_peer_request();
3156 return;
3157
3158 blocked:
3159 if (mdr->peer_request->should_notify_blocking()) {
3160 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3161 reply->mark_req_blocked();
3162 mds->send_message_mds(reply, mdr->peer_to_mds);
3163 mdr->peer_request->clear_notify_blocking();
3164 }
3165 return;
3166 }
3167
3168 void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
3169 {
3170 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3171 mds_rank_t from = mds_rank_t(ack->get_source().num());
3172
3173 if (ack->is_req_blocked()) {
3174 mdr->disable_lock_cache();
3175 // peer auth pin is blocked, drop locks to avoid deadlock
3176 mds->locker->drop_locks(mdr.get(), nullptr);
3177 return;
3178 }
3179
3180 // added auth pins?
3181 set<MDSCacheObject*> pinned;
3182 for (const auto &oi : ack->get_authpins()) {
3183 MDSCacheObject *object = mdcache->get_object(oi);
3184 ceph_assert(object); // we pinned it
3185 dout(10) << " remote has pinned " << *object << dendl;
3186 mdr->set_remote_auth_pinned(object, from);
3187 if (oi == ack->get_authpin_freeze())
3188 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3189 pinned.insert(object);
3190 }
3191
3192 // removed frozen auth pin ?
3193 if (mdr->more()->is_remote_frozen_authpin &&
3194 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3195 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3196 ceph_assert(stat_p);
3197 if (stat_p->remote_auth_pinned == from) {
3198 mdr->more()->is_remote_frozen_authpin = false;
3199 }
3200 }
3201
3202 // removed auth pins?
3203 for (auto& p : mdr->object_states) {
3204 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3205 continue;
3206 MDSCacheObject* object = p.first;
3207 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3208 dout(10) << " remote has unpinned " << *object << dendl;
3209 mdr->_clear_remote_auth_pinned(p.second);
3210 }
3211 }
3212
3213 // note peer
3214 mdr->more()->peers.insert(from);
3215
3216 // clear from waiting list
3217 auto ret = mdr->more()->waiting_on_peer.erase(from);
3218 ceph_assert(ret);
3219
3220 if (ack->is_error_rofs()) {
3221 mdr->more()->peer_error = -CEPHFS_EROFS;
3222 } else if (ack->is_error_wouldblock()) {
3223 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
3224 }
3225
3226 // go again?
3227 if (mdr->more()->waiting_on_peer.empty())
3228 mdcache->dispatch_request(mdr);
3229 else
3230 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
3231 }
3232
3233
3234 // ---------------------------------------
3235 // HELPERS
3236
3237
3238 /**
3239 * check whether we are permitted to complete a request
3240 *
3241 * Check whether we have permission to perform the operation specified
3242 * by mask on the given inode, based on the capability in the mdr's
3243 * session.
3244 */
3245 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3246 {
3247 if (mdr->session) {
3248 int r = mdr->session->check_access(
3249 in, mask,
3250 mdr->client_request->get_caller_uid(),
3251 mdr->client_request->get_caller_gid(),
3252 &mdr->client_request->get_caller_gid_list(),
3253 mdr->client_request->head.args.setattr.uid,
3254 mdr->client_request->head.args.setattr.gid);
3255 if (r < 0) {
3256 respond_to_request(mdr, r);
3257 return false;
3258 }
3259 }
3260 return true;
3261 }
3262
3263 /**
3264 * check whether the fragment has reached its maximum size
3265 *
3266 */
3267 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
3268 {
3269 const auto size = dir->get_frag_size();
3270 const auto max = bal_fragment_size_max;
3271 if (size >= max) {
3272 dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
3273 respond_to_request(mdr, -CEPHFS_ENOSPC);
3274 return false;
3275 } else {
3276 dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
3277 }
3278
3279 return true;
3280 }
3281
3282 /**
3283 * check whether the number of entries in a directory has reached the maximum
3284 *
3285 */
3286 bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
3287 {
3288 const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
3289 in->inode->get_projected_inode()->dirstat.nsubdirs;
3290 if (dir_max_entries && size >= dir_max_entries) {
3291 dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
3292 respond_to_request(mdr, -ENOSPC);
3293 return false;
3294 }
3295 return true;
3296 }
3297
3298
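// Find or create the null stray dentry that `in` will be relinked under when it
// is unlinked.  Returns nullptr if the stray dir is over the fragment size limit
// (the request is answered with CEPHFS_ENOSPC) or frozen (the request is queued
// to retry once the dir unfreezes).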
3299 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3300 {
3301 string straydname;
3302 in->name_stray_dentry(straydname);
3303
3304 CDentry *straydn = mdr->straydn;
3305 if (straydn) {
3306 ceph_assert(straydn->get_name() == straydname);
3307 return straydn;
3308 }
3309 CDir *straydir = mdcache->get_stray_dir(in);
3310
3311 if (!mdr->client_request->is_replay() &&
3312 !check_fragment_space(mdr, straydir))
3313 return nullptr;
3314
3315 straydn = straydir->lookup(straydname);
3316 if (!straydn) {
3317 if (straydir->is_frozen_dir()) {
3318 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3319 mds->locker->drop_locks(mdr.get());
3320 mdr->drop_local_auth_pins();
3321 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3322 return nullptr;
3323 }
3324 straydn = straydir->add_null_dentry(straydname);
3325 straydn->mark_new();
3326 } else {
3327 ceph_assert(straydn->get_projected_linkage()->is_null());
3328 }
3329
3330 straydn->state_set(CDentry::STATE_STRAY);
3331 mdr->straydn = straydn;
3332 mdr->pin(straydn);
3333
3334 return straydn;
3335 }
3336
3337 /** prepare_new_inode
3338 *
3339 * create a new inode. set c/m/atime. hit dir pop.
3340 */
3341 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3342 const file_layout_t *layout)
3343 {
3344 CInode *in = new CInode(mdcache);
3345 auto _inode = in->_get_inode();
3346
3347 // Server::prepare_force_open_sessions() can re-open a session that is in the
3348 // closing state. In that corner case the session's prealloc_inos are being
3349 // freed. To keep the code simple, we disallow using or refilling the session's
3350 // prealloc_inos while the session is still opening.
3351 bool allow_prealloc_inos = mdr->session->is_open();
3352
3353 // assign ino
3354 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
3355 mds->sessionmap.mark_projected(mdr->session);
3356 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3357 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3358 << dendl;
3359 } else {
3360 mdr->alloc_ino =
3361 _inode->ino = mds->inotable->project_alloc_id(useino);
3362 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3363 }
3364
3365 if (useino && useino != _inode->ino) {
3366 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
3367 mds->clog->error() << mdr->client_request->get_source()
3368 << " specified ino " << useino
3369 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
3370 //ceph_abort(); // just for now.
3371 }
3372
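  // Top up the session's preallocated ino pool once it drops below half of
  // mds_client_prealloc_inos; the new ids sit in pending_prealloc_inos until
  // apply_allocated_inos() moves them into the session proper.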
3373 if (allow_prealloc_inos &&
3374 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3375 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3376 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3377 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3378 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3379 mds->sessionmap.mark_projected(mdr->session);
3380 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3381 }
3382
3383 _inode->version = 1;
3384 _inode->xattr_version = 1;
3385 _inode->nlink = 1; // FIXME
3386
3387 _inode->mode = mode;
3388
3389 // FIPS zeroization audit 20191117: this memset is not security related.
3390 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3391 if (_inode->is_dir()) {
3392 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3393 } else if (layout) {
3394 _inode->layout = *layout;
3395 } else {
3396 _inode->layout = mdcache->default_file_layout;
3397 }
3398
3399 _inode->truncate_size = -1ull; // not truncated, yet!
3400 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3401
3402 CInode *diri = dir->get_inode();
3403 auto pip = diri->get_projected_inode();
3404
3405 dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;
3406
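  // Honour a setgid parent directory: the new inode inherits its gid, and a new
  // subdirectory also inherits the setgid bit itself.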
3407 if (pip->mode & S_ISGID) {
3408 dout(10) << " dir is sticky" << dendl;
3409 _inode->gid = pip->gid;
3410 if (S_ISDIR(mode)) {
3411 dout(10) << " new dir also sticky" << dendl;
3412 _inode->mode |= S_ISGID;
3413 }
3414 } else {
3415 _inode->gid = mdr->client_request->get_caller_gid();
3416 }
3417
3418 _inode->uid = mdr->client_request->get_caller_uid();
3419
3420 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
3421 mdr->get_op_stamp();
3422
3423 _inode->change_attr = 0;
3424
3425 const cref_t<MClientRequest> &req = mdr->client_request;
3426
3427 dout(10) << "copying fscrypt_auth len " << req->fscrypt_auth.size() << dendl;
3428 _inode->fscrypt_auth = req->fscrypt_auth;
3429 _inode->fscrypt_file = req->fscrypt_file;
3430
3431 if (req->get_data().length()) {
3432 auto p = req->get_data().cbegin();
3433
3434 // xattrs on new inode?
3435 auto _xattrs = CInode::allocate_xattr_map();
3436 decode_noshare(*_xattrs, p);
3437 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3438 in->reset_xattrs(std::move(_xattrs));
3439 }
3440
3441 if (!mds->mdsmap->get_inline_data_enabled() ||
3442 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3443 _inode->inline_data.version = CEPH_INLINE_NONE;
3444
3445 mdcache->add_inode(in); // add
3446 dout(10) << "prepare_new_inode " << *in << dendl;
3447 return in;
3448 }
3449
3450 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3451 {
3452 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3453 << " inotablev " << mds->inotable->get_projected_version()
3454 << dendl;
3455 blob->set_ino_alloc(mdr->alloc_ino,
3456 mdr->used_prealloc_ino,
3457 mdr->prealloc_inos,
3458 mdr->client_request->get_source(),
3459 mds->sessionmap.get_projected(),
3460 mds->inotable->get_projected_version());
3461 }
3462
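// Commit the ino allocations recorded on this request: apply the single
// allocated ino to the InoTable, move preallocated inos from the session's
// pending set into its free/info sets, and drop a consumed prealloc ino from
// the session info.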
3463 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3464 {
3465 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3466 << " / " << mdr->prealloc_inos
3467 << " / " << mdr->used_prealloc_ino << dendl;
3468
3469 if (mdr->alloc_ino) {
3470 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3471 }
3472 if (mdr->prealloc_inos.size()) {
3473 ceph_assert(session);
3474 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3475 session->free_prealloc_inos.insert(mdr->prealloc_inos);
3476 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3477 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3478 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3479 }
3480 if (mdr->used_prealloc_ino) {
3481 ceph_assert(session);
3482 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
3483 mds->sessionmap.mark_dirty(session);
3484 }
3485 }
3486
3487 struct C_MDS_TryOpenInode : public ServerContext {
3488 MDRequestRef mdr;
3489 inodeno_t ino;
3490 C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
3491 ServerContext(s), mdr(r), ino(i) {}
3492 void finish(int r) override {
3493 server->_try_open_ino(mdr, r, ino);
3494 }
3495 };
3496
3497 void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
3498 {
3499 dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3500
3501 // `r` is a rank if >=0, else an error code
3502 if (r >= 0) {
3503 mds_rank_t dest_rank(r);
3504 if (dest_rank == mds->get_nodeid())
3505 dispatch_client_request(mdr);
3506 else
3507 mdcache->request_forward(mdr, dest_rank);
3508 return;
3509 }
3510
3511 // give up
3512 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
3513 r = -CEPHFS_ESTALE;
3514 respond_to_request(mdr, r);
3515 }
3516
3517 class C_MDS_TryFindInode : public ServerContext {
3518 MDRequestRef mdr;
3519 MDCache *mdcache;
3520 inodeno_t ino;
3521 public:
3522 C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
3523 ServerContext(s), mdr(r), mdcache(m), ino(i) {}
3524 void finish(int r) override {
3525 if (r == -CEPHFS_ESTALE) { // :( find_ino_peers failed
3526 /*
3527 * There is a case where the MDS crashes before the
3528 * openfiletable journal can be flushed, and the replacing
3529 * MDS then may not load some already-opened CInodes into
3530 * the MDCache. If clients retry requests after
3531 * reconnecting, the MDS would return -ESTALE after
3532 * failing to find the ino on any active peer.
3533 *
3534 * (As a manual workaround, users can run `ls -R ${mountpoint}`
3535 * to list all sub-files and sub-directories under the
3536 * mountpoint.)
3537 *
3538 * Instead, try to open the ino here and then retry the
3539 * request.
3540 */
3541 CInode *in = mdcache->get_inode(ino);
3542 if (in && in->state_test(CInode::STATE_PURGING))
3543 server->respond_to_request(mdr, r);
3544 else
3545 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
3546 } else {
3547 server->dispatch_client_request(mdr);
3548 }
3549 }
3550 };
3551
3552 /* If this returns null, the request has been handled
3553 * as appropriate: forwarded on, or the client's been replied to */
3554 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3555 bool want_auth,
3556 bool no_want_auth)
3557 {
3558 const filepath& refpath = mdr->get_filepath();
3559 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3560
3561 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3562 return mdr->in[0];
3563
3564 // traverse
3565 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3566 int flags = 0;
3567 if (refpath.is_last_snap()) {
3568 if (!no_want_auth)
3569 want_auth = true;
3570 } else {
3571 if (!no_want_auth && forward_all_requests_to_auth)
3572 want_auth = true;
3573 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3574 }
3575 if (want_auth)
3576 flags |= MDS_TRAVERSE_WANT_AUTH;
3577 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3578 if (r > 0)
3579 return nullptr; // delayed
3580 if (r < 0) { // error
3581 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
3582 if (mdr->client_request &&
3583 mdr->client_request->get_dentry_wanted())
3584 mdr->tracedn = mdr->dn[0].back();
3585 respond_to_request(mdr, r);
3586 } else if (r == -CEPHFS_ESTALE) {
3587 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3588 inodeno_t ino = refpath.get_ino();
3589 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3590 } else {
3591 dout(10) << "FAIL on error " << r << dendl;
3592 respond_to_request(mdr, r);
3593 }
3594 return nullptr;
3595 }
3596 CInode *ref = mdr->in[0];
3597 dout(10) << "ref is " << *ref << dendl;
3598
3599 if (want_auth) {
3600 // auth_pin?
3601 // do NOT proceed if freezing, as cap release may defer in that case, and
3602 // we could deadlock when we try to lock @ref.
3603 // if we're already auth_pinned, continue; the release has already been processed.
3604 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3605 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3606 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3607 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3608 if (mdr->is_any_remote_auth_pin())
3609 mds->locker->notify_freeze_waiter(ref);
3610 return nullptr;
3611 }
3612 mdr->auth_pin(ref);
3613 }
3614
3615 // set and pin ref
3616 mdr->pin(ref);
3617 return ref;
3618 }
3619
3620
3621 /** rdlock_path_xlock_dentry
3622 * traverse the path to the directory that could/would contain the dentry.
3623 * make sure we are auth for that dentry (or for the target inode, if it exists and authexist),
3624 * forwarding as necessary. create a null dentry in place (or use the existing one if okexist).
3625 * take rdlocks on the traversed dentries and an xlock on the new dentry.
3626 *
3627 * set authexist true if the caller requires the target inode to be auth when it exists.
3628 * in that case the tail dentry is not necessarily auth, since the tail dentry and the
3629 * target inode cannot be guaranteed to both be auth on a single mds; the tail dentry
3630 * is also not xlocked when authexist is set and the target inode exists.
3631 */
3632 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3633 bool create, bool okexist, bool authexist,
3634 bool want_layout)
3635 {
3636 const filepath& refpath = mdr->get_filepath();
3637 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3638
3639 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3640 return mdr->dn[0].back();
3641
3642 // figure parent dir vs dname
3643 if (refpath.depth() == 0) {
3644 dout(7) << "invalid path (zero length)" << dendl;
3645 respond_to_request(mdr, -CEPHFS_EINVAL);
3646 return nullptr;
3647 }
3648
3649 if (refpath.is_last_snap()) {
3650 respond_to_request(mdr, -CEPHFS_EROFS);
3651 return nullptr;
3652 }
3653
3654 if (refpath.is_last_dot_or_dotdot()) {
3655 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3656 if (create)
3657 respond_to_request(mdr, -CEPHFS_EEXIST);
3658 else
3659 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
3660 return nullptr;
3661 }
3662
3663 // traverse to parent dir
3664 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3665 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3666 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3667 MDS_TRAVERSE_WANT_AUTH;
3668 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3669 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3670 if (create)
3671 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3672 if (authexist)
3673 flags |= MDS_TRAVERSE_WANT_INODE;
3674 if (want_layout)
3675 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3676 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3677 if (r > 0)
3678 return nullptr; // delayed
3679 if (r < 0) {
3680 if (r == -CEPHFS_ESTALE) {
3681 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3682 inodeno_t ino = refpath.get_ino();
3683 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3684 return nullptr;
3685 }
3686 respond_to_request(mdr, r);
3687 return nullptr;
3688 }
3689
3690 CDentry *dn = mdr->dn[0].back();
3691 CDir *dir = dn->get_dir();
3692 CInode *diri = dir->get_inode();
3693
3694 if (!mdr->reqid.name.is_mds()) {
3695 if (diri->is_system() && !diri->is_root()) {
3696 respond_to_request(mdr, -CEPHFS_EROFS);
3697 return nullptr;
3698 }
3699 }
3700
3701 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3702 respond_to_request(mdr, -CEPHFS_ENOENT);
3703 return nullptr;
3704 }
3705
3706 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3707 if (dnl->is_null()) {
3708 if (!create && okexist) {
3709 respond_to_request(mdr, -CEPHFS_ENOENT);
3710 return nullptr;
3711 }
3712
3713 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3714 dn->first = std::max(dn->first, next_snap);
3715 } else {
3716 if (!okexist) {
3717 respond_to_request(mdr, -CEPHFS_EEXIST);
3718 return nullptr;
3719 }
3720 mdr->in[0] = dnl->get_inode();
3721 }
3722
3723 return dn;
3724 }
3725
3726 /** rdlock_two_paths_xlock_destdn
3727 * traverse two paths and lock the two paths in proper order.
3728 * The order of taking locks is:
3729 * 1. Lock directory inodes or dentries according to which trees they
3730 * are under. Lock objects under fs root before objects under mdsdir.
3731 * 2. Lock directory inodes or dentries according to their depth, in
3732 * ascending order.
3733 * 3. Lock directory inodes or dentries according to inode numbers or
3734 * dentries' parent inode numbers, in ascending order.
3735 * 4. Lock dentries in the same directory in order of their keys.
3736 * 5. Lock non-directory inodes according to inode numbers, in ascending
3737 * order.
3738 */
3739 std::pair<CDentry*, CDentry*>
3740 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3741 {
3742
3743 const filepath& refpath = mdr->get_filepath();
3744 const filepath& refpath2 = mdr->get_filepath2();
3745
3746 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3747
3748 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3749 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3750
3751 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3752 respond_to_request(mdr, -CEPHFS_EINVAL);
3753 return std::make_pair(nullptr, nullptr);
3754 }
3755
3756 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3757 respond_to_request(mdr, -CEPHFS_EROFS);
3758 return std::make_pair(nullptr, nullptr);
3759 }
3760
3761 // traverse to parent dir
3762 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3763 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3764 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3765 if (r != 0) {
3766 if (r == -CEPHFS_ESTALE) {
3767 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
3768 inodeno_t ino = refpath.get_ino();
3769 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3770 } else if (r < 0) {
3771 respond_to_request(mdr, r);
3772 }
3773 return std::make_pair(nullptr, nullptr);
3774 }
3775
3776 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3777 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3778 if (r != 0) {
3779 if (r == -CEPHFS_ESTALE) {
3780 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
3781 inodeno_t ino = refpath2.get_ino();
3782 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3783 } else if (r < 0) {
3784 respond_to_request(mdr, r);
3785 }
3786 return std::make_pair(nullptr, nullptr);
3787 }
3788
3789 CDentry *srcdn = mdr->dn[1].back();
3790 CDir *srcdir = srcdn->get_dir();
3791 CDentry *destdn = mdr->dn[0].back();
3792 CDir *destdir = destdn->get_dir();
3793
3794 if (!mdr->reqid.name.is_mds()) {
3795 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3796 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3797 respond_to_request(mdr, -CEPHFS_EROFS);
3798 return std::make_pair(nullptr, nullptr);
3799 }
3800 }
3801
3802 if (!destdir->get_inode()->is_base() &&
3803 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3804 respond_to_request(mdr, -CEPHFS_ENOENT);
3805 return std::make_pair(nullptr, nullptr);
3806 }
3807
3808 MutationImpl::LockOpVec lov;
3809 if (srcdir->get_inode() == destdir->get_inode()) {
3810 lov.add_wrlock(&destdir->inode->filelock);
3811 lov.add_wrlock(&destdir->inode->nestlock);
3812 if (xlock_srcdn && srcdir != destdir) {
3813 mds_rank_t srcdir_auth = srcdir->authority().first;
3814 if (srcdir_auth != mds->get_nodeid()) {
3815 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3816 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3817 }
3818 }
3819
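    // same directory inode: take the two dentry locks in key (name) order, per rule 4 above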
3820 if (srcdn->get_name() > destdn->get_name())
3821 lov.add_xlock(&destdn->lock);
3822
3823 if (xlock_srcdn)
3824 lov.add_xlock(&srcdn->lock);
3825 else
3826 lov.add_rdlock(&srcdn->lock);
3827
3828 if (srcdn->get_name() < destdn->get_name())
3829 lov.add_xlock(&destdn->lock);
3830 } else {
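    // different parent directories: order the dir wrlocks (and destdn's xlock)
    // using compare_paths(), tie-breaking on the directories' inode numbers,
    // per rules 1-3 above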
3831 int cmp = mdr->compare_paths();
3832 bool lock_destdir_first =
3833 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3834
3835 if (lock_destdir_first) {
3836 lov.add_wrlock(&destdir->inode->filelock);
3837 lov.add_wrlock(&destdir->inode->nestlock);
3838 lov.add_xlock(&destdn->lock);
3839 }
3840
3841 if (xlock_srcdn) {
3842 mds_rank_t srcdir_auth = srcdir->authority().first;
3843 if (srcdir_auth == mds->get_nodeid()) {
3844 lov.add_wrlock(&srcdir->inode->filelock);
3845 lov.add_wrlock(&srcdir->inode->nestlock);
3846 } else {
3847 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3848 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3849 }
3850 lov.add_xlock(&srcdn->lock);
3851 } else {
3852 lov.add_rdlock(&srcdn->lock);
3853 }
3854
3855 if (!lock_destdir_first) {
3856 lov.add_wrlock(&destdir->inode->filelock);
3857 lov.add_wrlock(&destdir->inode->nestlock);
3858 lov.add_xlock(&destdn->lock);
3859 }
3860 }
3861
3862 CInode *auth_pin_freeze = nullptr;
3863 // XXX any better way to do this?
3864 if (xlock_srcdn && !srcdn->is_auth()) {
3865 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3866 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3867 }
3868 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3869 return std::make_pair(nullptr, nullptr);
3870
3871 if (srcdn->get_projected_linkage()->is_null()) {
3872 respond_to_request(mdr, -CEPHFS_ENOENT);
3873 return std::make_pair(nullptr, nullptr);
3874 }
3875
3876 if (destdn->get_projected_linkage()->is_null()) {
3877 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3878 destdn->first = std::max(destdn->first, next_snap);
3879 }
3880
3881 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3882
3883 return std::make_pair(destdn, srcdn);
3884 }
3885
3886 /**
3887 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3888 *
3889 * @param diri base inode
3890 * @param fg the exact frag we want
3891 * @param mdr request
3892 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3893 */
3894 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3895 {
3896 CDir *dir = diri->get_dirfrag(fg);
3897
3898 if (dir) {
3899 // am i auth for the dirfrag?
3900 if (!dir->is_auth()) {
3901 mds_rank_t auth = dir->authority().first;
3902 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3903 << ", fw to mds." << auth << dendl;
3904 mdcache->request_forward(mdr, auth);
3905 return nullptr;
3906 }
3907 } else {
3908 // not open and inode not mine?
3909 if (!diri->is_auth()) {
3910 mds_rank_t inauth = diri->authority().first;
3911 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3912 mdcache->request_forward(mdr, inauth);
3913 return nullptr;
3914 }
3915
3916 // not open and inode frozen?
3917 if (diri->is_frozen()) {
3918 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3919 ceph_assert(diri->get_parent_dir());
3920 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3921 return nullptr;
3922 }
3923
3924 // invent?
3925 dir = diri->get_or_open_dirfrag(mdcache, fg);
3926 }
3927
3928 return dir;
3929 }
3930
3931
3932 // ===============================================================================
3933 // STAT
3934
3935 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3936 {
3937 const cref_t<MClientRequest> &req = mdr->client_request;
3938
3939 if (req->get_filepath().depth() == 0 && is_lookup) {
3940 // refpath can't be empty for lookup but it can for
3941 // getattr (we do getattr with empty refpath for mount of '/')
3942 respond_to_request(mdr, -CEPHFS_EINVAL);
3943 return;
3944 }
3945
3946 bool want_auth = false;
3947 int mask = req->head.args.getattr.mask;
3948 if (mask & CEPH_STAT_RSTAT)
3949 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3950
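  // Batch concurrent getattr/lookup requests that use the same mask on the same
  // inode/dentry: the first request becomes the batch head and does the work,
  // later ones are queued on it and share its reply.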
3951 if (!mdr->is_batch_head() && mdr->can_batch()) {
3952 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
3953 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3954 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3955 &mdr->dn[0], &mdr->in[0]);
3956 if (r > 0)
3957 return; // delayed
3958
3959 if (r < 0) {
3960 // fall-thru. let rdlock_path_pin_ref() check again.
3961 } else if (is_lookup) {
3962 CDentry* dn = mdr->dn[0].back();
3963 mdr->pin(dn);
3964 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3965 if (em.second) {
3966 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3967 } else {
3968 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3969 em.first->second->add_request(mdr);
3970 return;
3971 }
3972 } else {
3973 CInode *in = mdr->in[0];
3974 mdr->pin(in);
3975 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3976 if (em.second) {
3977 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3978 } else {
3979 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3980 em.first->second->add_request(mdr);
3981 return;
3982 }
3983 }
3984 }
3985
3986 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3987 if (!ref)
3988 return;
3989
3990 /*
3991 * if client currently holds the EXCL cap on a field, do not rdlock
3992 * it; client's stat() will result in valid info if _either_ EXCL
3993 * cap is held or MDS rdlocks and reads the value here.
3994 *
3995 * handling this case here is easier than weakening rdlock
3996 * semantics... that would cause problems elsewhere.
3997 */
3998 client_t client = mdr->get_client();
3999 int issued = 0;
4000 Capability *cap = ref->get_client_cap(client);
4001 if (cap && (mdr->snapid == CEPH_NOSNAP ||
4002 mdr->snapid <= cap->client_follows))
4003 issued = cap->issued();
4004
4005 // FIXME
4006 MutationImpl::LockOpVec lov;
4007 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
4008 lov.add_rdlock(&ref->linklock);
4009 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
4010 lov.add_rdlock(&ref->authlock);
4011 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
4012 lov.add_rdlock(&ref->xattrlock);
4013 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
4014 // Don't wait on unstable filelock if client is allowed to read file size.
4015 // This can reduce the response time of getattr in the case that multiple
4016 // clients do stat(2) and there are writers.
4017 // The downside of this optimization is that mds may not issue Fs caps along
4018 // with getattr reply. Client may need to send more getattr requests.
4019 if (mdr->is_rdlocked(&ref->filelock)) {
4020 lov.add_rdlock(&ref->filelock);
4021 } else if (ref->filelock.is_stable() ||
4022 ref->filelock.get_num_wrlocks() > 0 ||
4023 !ref->filelock.can_read(mdr->get_client())) {
4024 lov.add_rdlock(&ref->filelock);
4025 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
4026 }
4027 }
4028
4029 if (!mds->locker->acquire_locks(mdr, lov))
4030 return;
4031
4032 if (!check_access(mdr, ref, MAY_READ))
4033 return;
4034
4035 utime_t now = ceph_clock_now();
4036 mdr->set_mds_stamp(now);
4037
4038 // note which caps are requested, so we return at least a snapshot
4039 // value for them. (currently this matters for xattrs and inline data)
4040 mdr->getattr_caps = mask;
4041
4042 mds->balancer->hit_inode(ref, META_POP_IRD);
4043
4044 // reply
4045 dout(10) << "reply to stat on " << *req << dendl;
4046 mdr->tracei = ref;
4047 if (is_lookup)
4048 mdr->tracedn = mdr->dn[0].back();
4049 respond_to_request(mdr, 0);
4050 }
4051
4052 struct C_MDS_LookupIno2 : public ServerContext {
4053 MDRequestRef mdr;
4054 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
4055 void finish(int r) override {
4056 server->_lookup_ino_2(mdr, r);
4057 }
4058 };
4059
4060 /*
4061 * filepath: ino
4062 */
4063 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
4064 bool want_parent, bool want_dentry)
4065 {
4066 const cref_t<MClientRequest> &req = mdr->client_request;
4067
4068 if ((uint64_t)req->head.args.lookupino.snapid > 0)
4069 return _lookup_snap_ino(mdr);
4070
4071 inodeno_t ino = req->get_filepath().get_ino();
4072 auto _ino = ino.val;
4073
4074 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
4075 * I do not have an explanation for how that happened organically but this
4076 * check will ensure that the client can no longer do that.
4077 *
4078 * [1] https://tracker.ceph.com/issues/49922
4079 */
4080 if (MDS_IS_PRIVATE_INO(_ino)) {
4081 respond_to_request(mdr, -CEPHFS_ESTALE);
4082 return;
4083 }
4084
4085 CInode *in = mdcache->get_inode(ino);
4086 if (in && in->state_test(CInode::STATE_PURGING)) {
4087 respond_to_request(mdr, -CEPHFS_ESTALE);
4088 return;
4089 }
4090 if (!in) {
4091 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
4092 return;
4093 }
4094
4095 // check for nothing (not read or write); this still applies the
4096 // path check.
4097 if (!check_access(mdr, in, 0))
4098 return;
4099
4100 CDentry *dn = in->get_projected_parent_dn();
4101 CInode *diri = dn ? dn->get_dir()->inode : NULL;
4102
4103 MutationImpl::LockOpVec lov;
4104 if (dn && (want_parent || want_dentry)) {
4105 mdr->pin(dn);
4106 lov.add_rdlock(&dn->lock);
4107 }
4108
4109 unsigned mask = req->head.args.lookupino.mask;
4110 if (mask) {
4111 Capability *cap = in->get_client_cap(mdr->get_client());
4112 int issued = 0;
4113 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4114 issued = cap->issued();
4115 // FIXME
4116 // permission bits, ACL/security xattrs
4117 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4118 lov.add_rdlock(&in->authlock);
4119 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4120 lov.add_rdlock(&in->xattrlock);
4121
4122 mdr->getattr_caps = mask;
4123 }
4124
4125 if (!lov.empty()) {
4126 if (!mds->locker->acquire_locks(mdr, lov))
4127 return;
4128
4129 if (diri != NULL) {
4130 // need read access to directory inode
4131 if (!check_access(mdr, diri, MAY_READ))
4132 return;
4133 }
4134 }
4135
4136 if (want_parent) {
4137 if (in->is_base()) {
4138 respond_to_request(mdr, -CEPHFS_EINVAL);
4139 return;
4140 }
4141 if (!diri || diri->is_stray()) {
4142 respond_to_request(mdr, -CEPHFS_ESTALE);
4143 return;
4144 }
4145 dout(10) << "reply to lookup_parent " << *in << dendl;
4146 mdr->tracei = diri;
4147 respond_to_request(mdr, 0);
4148 } else {
4149 if (want_dentry) {
4150 inodeno_t dirino = req->get_filepath2().get_ino();
4151 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
4152 respond_to_request(mdr, -CEPHFS_ENOENT);
4153 return;
4154 }
4155 dout(10) << "reply to lookup_name " << *in << dendl;
4156 } else
4157 dout(10) << "reply to lookup_ino " << *in << dendl;
4158
4159 mdr->tracei = in;
4160 if (want_dentry)
4161 mdr->tracedn = dn;
4162 respond_to_request(mdr, 0);
4163 }
4164 }
4165
4166 void Server::_lookup_snap_ino(MDRequestRef& mdr)
4167 {
4168 const cref_t<MClientRequest> &req = mdr->client_request;
4169
4170 vinodeno_t vino;
4171 vino.ino = req->get_filepath().get_ino();
4172 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4173 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4174 __u32 hash = req->head.args.lookupino.hash;
4175
4176 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4177
4178 CInode *in = mdcache->lookup_snap_inode(vino);
4179 if (!in) {
4180 in = mdcache->get_inode(vino.ino);
4181 if (in) {
4182 if (in->state_test(CInode::STATE_PURGING) ||
4183 !in->has_snap_data(vino.snapid)) {
4184 if (in->is_dir() || !parent_ino) {
4185 respond_to_request(mdr, -CEPHFS_ESTALE);
4186 return;
4187 }
4188 in = NULL;
4189 }
4190 }
4191 }
4192
4193 if (in) {
4194 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4195 mdr->snapid = vino.snapid;
4196 mdr->tracei = in;
4197 respond_to_request(mdr, 0);
4198 return;
4199 }
4200
4201 CInode *diri = NULL;
4202 if (parent_ino) {
4203 diri = mdcache->get_inode(parent_ino);
4204 if (!diri) {
4205 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
4206 return;
4207 }
4208
4209 if (!diri->is_dir()) {
4210 respond_to_request(mdr, -CEPHFS_EINVAL);
4211 return;
4212 }
4213
4214 MutationImpl::LockOpVec lov;
4215 lov.add_rdlock(&diri->dirfragtreelock);
4216 if (!mds->locker->acquire_locks(mdr, lov))
4217 return;
4218
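    // map the client-supplied name hash to the dirfrag that should contain the
    // dentry; fetching that frag (if it isn't already complete) loads the
    // snapped inode into cache so the retried request can find it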
4219 frag_t frag = diri->dirfragtree[hash];
4220 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4221 if (!dir)
4222 return;
4223
4224 if (!dir->is_complete()) {
4225 if (dir->is_frozen()) {
4226 mds->locker->drop_locks(mdr.get());
4227 mdr->drop_local_auth_pins();
4228 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4229 return;
4230 }
4231 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4232 return;
4233 }
4234
4235 respond_to_request(mdr, -CEPHFS_ESTALE);
4236 } else {
4237 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4238 }
4239 }
4240
4241 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4242 {
4243 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4244 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4245
4246 // `r` is a rank if >=0, else an error code
4247 if (r >= 0) {
4248 mds_rank_t dest_rank(r);
4249 if (dest_rank == mds->get_nodeid())
4250 dispatch_client_request(mdr);
4251 else
4252 mdcache->request_forward(mdr, dest_rank);
4253 return;
4254 }
4255
4256 // give up
4257 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4258 r = -CEPHFS_ESTALE;
4259 respond_to_request(mdr, r);
4260 }
4261
4262
4263 /* This function takes responsibility for the passed mdr*/
4264 void Server::handle_client_open(MDRequestRef& mdr)
4265 {
4266 const cref_t<MClientRequest> &req = mdr->client_request;
4267 dout(7) << "open on " << req->get_filepath() << dendl;
4268
4269 int flags = req->head.args.open.flags;
4270 int cmode = ceph_flags_to_mode(flags);
4271 if (cmode < 0) {
4272 respond_to_request(mdr, -CEPHFS_EINVAL);
4273 return;
4274 }
4275
4276 bool need_auth = !file_mode_is_readonly(cmode) ||
4277 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4278
4279 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4280 dout(7) << "read-only FS" << dendl;
4281 respond_to_request(mdr, -CEPHFS_EROFS);
4282 return;
4283 }
4284
4285 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4286 if (!cur)
4287 return;
4288
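  // We may have traversed without requiring auth; if the inode turns out to be
  // frozen or exporting caps, redo the traverse with want_auth so
  // rdlock_path_pin_ref() waits or auth-pins before we go any further.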
4289 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4290 ceph_assert(!need_auth);
4291 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4292 CInode *cur = rdlock_path_pin_ref(mdr, true);
4293 if (!cur)
4294 return;
4295 }
4296
4297 if (!cur->is_file()) {
4298 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4299 cmode = CEPH_FILE_MODE_PIN;
4300 // the inode is a symlink and the client wants to follow it; ignore the O_TRUNC flag.
4301 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4302 flags &= ~CEPH_O_TRUNC;
4303 }
4304
4305 dout(10) << "open flags = " << flags
4306 << ", filemode = " << cmode
4307 << ", need_auth = " << need_auth
4308 << dendl;
4309
4310 // regular file?
4311 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4312 dout(7) << "not a file or dir " << *cur << dendl;
4313 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4314 return;
4315 }*/
4316 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
4317 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4318 respond_to_request(mdr, -CEPHFS_EINVAL);
4319 return;
4320 }
4321
4322 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
4323 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4324 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4325 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
4326 return;
4327 }
4328
4329 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
4330 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4331 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4332 respond_to_request(mdr, -CEPHFS_EPERM);
4333 return;
4334 }
4335
4336 // snapped data is read only
4337 if (mdr->snapid != CEPH_NOSNAP &&
4338 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4339 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4340 respond_to_request(mdr, -CEPHFS_EROFS);
4341 return;
4342 }
4343
4344 MutationImpl::LockOpVec lov;
4345
4346 unsigned mask = req->head.args.open.mask;
4347 if (mask) {
4348 Capability *cap = cur->get_client_cap(mdr->get_client());
4349 int issued = 0;
4350 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4351 issued = cap->issued();
4352 // permission bits, ACL/security xattrs
4353 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4354 lov.add_rdlock(&cur->authlock);
4355 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4356 lov.add_rdlock(&cur->xattrlock);
4357
4358 mdr->getattr_caps = mask;
4359 }
4360
4361 // O_TRUNC
4362 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4363 ceph_assert(cur->is_auth());
4364
4365 lov.add_xlock(&cur->filelock);
4366 if (!mds->locker->acquire_locks(mdr, lov))
4367 return;
4368
4369 if (!check_access(mdr, cur, MAY_WRITE))
4370 return;
4371
4372 // wait for pending truncate?
4373 const auto& pi = cur->get_projected_inode();
4374 if (pi->is_truncating()) {
4375 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4376 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4377 mds->locker->drop_locks(mdr.get());
4378 mdr->drop_local_auth_pins();
4379 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4380 return;
4381 }
4382
4383 do_open_truncate(mdr, cmode);
4384 return;
4385 }
4386
4387 // sync filelock if snapped.
4388 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4389 // and that data itself is flushed so that we can read the snapped data off disk.
4390 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4391 lov.add_rdlock(&cur->filelock);
4392 }
4393
4394 if (!mds->locker->acquire_locks(mdr, lov))
4395 return;
4396
4397 mask = MAY_READ;
4398 if (cmode & CEPH_FILE_MODE_WR)
4399 mask |= MAY_WRITE;
4400 if (!check_access(mdr, cur, mask))
4401 return;
4402
4403 utime_t now = ceph_clock_now();
4404 mdr->set_mds_stamp(now);
4405
4406 if (cur->is_file() || cur->is_dir()) {
4407 if (mdr->snapid == CEPH_NOSNAP) {
4408 // register new cap
4409 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4410 if (cap)
4411 dout(12) << "open issued caps " << ccap_string(cap->pending())
4412 << " for " << req->get_source()
4413 << " on " << *cur << dendl;
4414 } else {
4415 int caps = ceph_caps_for_mode(cmode);
4416 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4417 << " for " << req->get_source()
4418 << " snapid " << mdr->snapid
4419 << " on " << *cur << dendl;
4420 mdr->snap_caps = caps;
4421 }
4422 }
4423
4424 // increase max_size?
4425 if (cmode & CEPH_FILE_MODE_WR)
4426 mds->locker->check_inode_max_size(cur);
4427
4428 // make sure this inode gets into the journal
4429 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4430 mdcache->open_file_table.should_log_open(cur)) {
4431 EOpen *le = new EOpen(mds->mdlog);
4432 mdlog->start_entry(le);
4433 le->add_clean_inode(cur);
4434 mdlog->submit_entry(le);
4435 }
4436
4437 // hit pop
4438 if (cmode & CEPH_FILE_MODE_WR)
4439 mds->balancer->hit_inode(cur, META_POP_IWR);
4440 else
4441 mds->balancer->hit_inode(cur, META_POP_IRD);
4442
4443 CDentry *dn = nullptr;
4444 if (req->get_dentry_wanted()) {
4445 ceph_assert(mdr->dn[0].size());
4446 dn = mdr->dn[0].back();
4447 }
4448
4449 mdr->tracei = cur;
4450 mdr->tracedn = dn;
4451 respond_to_request(mdr, 0);
4452 }
4453
4454 class C_MDS_openc_finish : public ServerLogContext {
4455 CDentry *dn;
4456 CInode *newi;
4457 public:
4458 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4459 ServerLogContext(s, r), dn(d), newi(ni) {}
4460 void finish(int r) override {
4461 ceph_assert(r == 0);
4462
4463 dn->pop_projected_linkage();
4464
4465 // dirty inode, dn, dir
4466 newi->mark_dirty(mdr->ls);
4467 newi->mark_dirty_parent(mdr->ls, true);
4468
4469 mdr->apply();
4470
4471 get_mds()->locker->share_inode_max_size(newi);
4472
4473 MDRequestRef null_ref;
4474 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4475
4476 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4477
4478 server->respond_to_request(mdr, 0);
4479
4480 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4481 }
4482 };
4483
4484 /* This function takes responsibility for the passed mdr*/
4485 void Server::handle_client_openc(MDRequestRef& mdr)
4486 {
4487 const cref_t<MClientRequest> &req = mdr->client_request;
4488 client_t client = mdr->get_client();
4489
4490 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4491
4492 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4493 if (cmode < 0) {
4494 respond_to_request(mdr, -CEPHFS_EINVAL);
4495 return;
4496 }
4497
4498 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4499 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true, true);
4500 if (!dn)
4501 return;
4502
4503 if (is_unlink_pending(dn)) {
4504 wait_for_pending_unlink(dn, mdr);
4505 return;
4506 }
4507
4508 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4509 if (!excl && !dnl->is_null()) {
4510 // it existed.
4511 ceph_assert(mdr.get()->is_rdlocked(&dn->lock));
4512
4513 MutationImpl::LockOpVec lov;
4514 lov.add_rdlock(&dnl->get_inode()->snaplock);
4515 if (!mds->locker->acquire_locks(mdr, lov))
4516 return;
4517
4518 handle_client_open(mdr);
4519 return;
4520 }
4521
4522 ceph_assert(dnl->is_null());
4523
4524 if (req->get_alternate_name().size() > alternate_name_max) {
4525 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4526 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4527 return;
4528 }
4529 dn->set_alternate_name(req->get_alternate_name());
4530
4531 // set layout
4532 file_layout_t layout;
4533 if (mdr->dir_layout != file_layout_t())
4534 layout = mdr->dir_layout;
4535 else
4536 layout = mdcache->default_file_layout;
4537
4538 // What kind of client caps are required to complete this operation
4539 uint64_t access = MAY_WRITE;
4540
4541 const auto default_layout = layout;
4542
4543 // fill in any special params from client
4544 if (req->head.args.open.stripe_unit)
4545 layout.stripe_unit = req->head.args.open.stripe_unit;
4546 if (req->head.args.open.stripe_count)
4547 layout.stripe_count = req->head.args.open.stripe_count;
4548 if (req->head.args.open.object_size)
4549 layout.object_size = req->head.args.open.object_size;
4550 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4551 (__s32)req->head.args.open.pool >= 0) {
4552 layout.pool_id = req->head.args.open.pool;
4553
4554 // make sure we have as new a map as the client
4555 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4556 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4557 return;
4558 }
4559 }
4560
4561 // If client doesn't have capability to modify layout pools, then
4562 // only permit this request if the requested pool matches what the
4563 // file would have inherited anyway from its parent.
4564 if (default_layout != layout) {
4565 access |= MAY_SET_VXATTR;
4566 }
4567
4568 if (!layout.is_valid()) {
4569 dout(10) << " invalid initial file layout" << dendl;
4570 respond_to_request(mdr, -CEPHFS_EINVAL);
4571 return;
4572 }
4573 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4574 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4575 respond_to_request(mdr, -CEPHFS_EINVAL);
4576 return;
4577 }
4578
4579 // created null dn.
4580 CDir *dir = dn->get_dir();
4581 CInode *diri = dir->get_inode();
4582 if (!check_access(mdr, diri, access))
4583 return;
4584 if (!check_fragment_space(mdr, dir))
4585 return;
4586 if (!check_dir_max_entries(mdr, dir))
4587 return;
4588
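  // single-component create directly under diri: build a lock cache so that
  // later create requests in this directory can reuse these locks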
4589 if (mdr->dn[0].size() == 1)
4590 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4591
4592 // create inode.
4593 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4594 req->head.args.open.mode | S_IFREG, &layout);
4595 ceph_assert(newi);
4596
4597 // it's a file.
4598 dn->push_projected_linkage(newi);
4599
4600 auto _inode = newi->_get_inode();
4601 _inode->version = dn->pre_dirty();
4602 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4603 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4604 _inode->update_backtrace();
4605 _inode->rstat.rfiles = 1;
4606 _inode->accounted_rstat = _inode->rstat;
4607
4608 SnapRealm *realm = diri->find_snaprealm();
4609 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4610 ceph_assert(follows >= realm->get_newest_seq());
4611
4612 ceph_assert(dn->first == follows+1);
4613 newi->first = dn->first;
4614
4615 // do the open
4616 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4617 newi->authlock.set_state(LOCK_EXCL);
4618 newi->xattrlock.set_state(LOCK_EXCL);
4619
4620 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4621 _inode->client_ranges[client].range.first = 0;
4622 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4623 _inode->client_ranges[client].follows = follows;
4624 newi->mark_clientwriteable();
4625 cap->mark_clientwriteable();
4626 }
4627
4628 // prepare finisher
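  // journal the create as an EUpdate: the client request id, the allocated
  // ino(s), the projected parent dir accounting, and the new primary dentry
  // with its inode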
4629 mdr->ls = mdlog->get_current_segment();
4630 EUpdate *le = new EUpdate(mdlog, "openc");
4631 mdlog->start_entry(le);
4632 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4633 journal_allocated_inos(mdr, &le->metablob);
4634 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4635 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4636
4637 // make sure this inode gets into the journal
4638 le->metablob.add_opened_ino(newi->ino());
4639
4640 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4641
4642 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4643 openc_response_t ocresp;
4644
4645 dout(10) << "adding created_ino and delegated_inos" << dendl;
4646 ocresp.created_ino = _inode->ino;
4647
4648 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4649 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4650 unsigned frac = 100 / delegate_inos_pct;
4651 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4652 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4653 }
4654
4655 encode(ocresp, mdr->reply_extra_bl);
4656 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4657 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4658 // add the created ino to the reply if the client supports CEPH_FEATURE_REPLY_CREATE_INODE
4659 encode(newi->ino(), mdr->reply_extra_bl);
4660 }
4661
4662 journal_and_reply(mdr, newi, dn, le, fin);
4663
4664 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4665 // have overshot the split size (multiple opencs in flight), so here is
4666 // an early chance to split the dir if this openc makes it oversized.
4667 mds->balancer->maybe_fragment(dir, false);
4668 }
4669
4670
4671
4672 void Server::handle_client_readdir(MDRequestRef& mdr)
4673 {
4674 const cref_t<MClientRequest> &req = mdr->client_request;
4675 Session *session = mds->get_session(req);
4676 client_t client = req->get_source().num();
4677 MutationImpl::LockOpVec lov;
4678 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4679 if (!diri) return;
4680
4681 // it's a directory, right?
4682 if (!diri->is_dir()) {
4683 // not a dir
4684 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4685 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4686 return;
4687 }
4688
4689 auto num_caps = session->get_num_caps();
4690 auto session_cap_acquisition = session->get_cap_acquisition();
4691
4692 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4693 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4694 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4695 if (logger)
4696 logger->inc(l_mdss_cap_acquisition_throttle);
4697
4698 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4699 return;
4700 }
4701
4702 lov.add_rdlock(&diri->filelock);
4703 lov.add_rdlock(&diri->dirfragtreelock);
4704
4705 if (!mds->locker->acquire_locks(mdr, lov))
4706 return;
4707
4708 if (!check_access(mdr, diri, MAY_READ))
4709 return;
4710
4711 // which frag?
4712 frag_t fg = (__u32)req->head.args.readdir.frag;
4713 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4714 string offset_str = req->get_path2();
4715
4716 __u32 offset_hash = 0;
4717 if (!offset_str.empty())
4718 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4719 else
4720 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4721
4722 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4723 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4724
4725 // does the frag exist?
4726 if (diri->dirfragtree[fg.value()] != fg) {
4727 frag_t newfg;
4728 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4729 if (fg.contains((unsigned)offset_hash)) {
4730 newfg = diri->dirfragtree[offset_hash];
4731 } else {
4732 // client actually wants next frag
4733 newfg = diri->dirfragtree[fg.value()];
4734 }
4735 } else {
4736 offset_str.clear();
4737 newfg = diri->dirfragtree[fg.value()];
4738 }
4739 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4740 fg = newfg;
4741 }
4742
4743 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4744 if (!dir) return;
4745
4746 // ok!
4747 dout(10) << "handle_client_readdir on " << *dir << dendl;
4748 ceph_assert(dir->is_auth());
4749
4750 if (!dir->is_complete()) {
4751 if (dir->is_frozen()) {
4752 dout(7) << "dir is frozen " << *dir << dendl;
4753 mds->locker->drop_locks(mdr.get());
4754 mdr->drop_local_auth_pins();
4755 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4756 return;
4757 }
4758 // fetch
4759 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4760 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4761 return;
4762 }
4763
4764 #ifdef MDS_VERIFY_FRAGSTAT
4765 dir->verify_fragstat();
4766 #endif
4767
4768 utime_t now = ceph_clock_now();
4769 mdr->set_mds_stamp(now);
4770
4771 snapid_t snapid = mdr->snapid;
4772 dout(10) << "snapid " << snapid << dendl;
4773
4774 SnapRealm *realm = diri->find_snaprealm();
4775
4776 unsigned max = req->head.args.readdir.max_entries;
4777 if (!max)
4778 max = dir->get_num_any(); // whatever, something big.
4779 unsigned max_bytes = req->head.args.readdir.max_bytes;
4780 if (!max_bytes)
4781 // make sure at least one item can be encoded
4782 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4783
4784 // start final blob
4785 bufferlist dirbl;
4786 DirStat ds;
4787 ds.frag = dir->get_frag();
4788 ds.auth = dir->get_dir_auth().first;
4789 if (dir->is_auth() && !forward_all_requests_to_auth)
4790 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4791
4792 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4793
4794 // count bytes available.
4795 // this isn't perfect, but we should capture the main variable/unbounded size items!
4796 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4797 int bytes_left = max_bytes - front_bytes;
4798 bytes_left -= get_snap_trace(session, realm).length();
4799
4800 // build dir contents
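  // encode dentries and their inodes until we hit max entries or the byte
  // budget; a partially encoded entry is rolled back before we stop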
4801 bufferlist dnbl;
4802 __u32 numfiles = 0;
4803 bool start = !offset_hash && offset_str.empty();
4804 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4805 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4806 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4807 bool end = (it == dir->end());
4808 for (; !end && numfiles < max; end = (it == dir->end())) {
4809 CDentry *dn = it->second;
4810 ++it;
4811
4812 if (dn->state_test(CDentry::STATE_PURGING))
4813 continue;
4814
4815 bool dnp = dn->use_projected(client, mdr);
4816 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4817
4818 if (dnl->is_null()) {
4819 if (dn->get_num_ref() == 0 && !dn->is_projected())
4820 dir->remove_dentry(dn);
4821 continue;
4822 }
4823
4824 if (dn->last < snapid || dn->first > snapid) {
4825 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4826 continue;
4827 }
4828
4829 if (!start) {
4830 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4831 if (!(offset_key < dn->key()))
4832 continue;
4833 }
4834
4835 CInode *in = dnl->get_inode();
4836
4837 if (in && in->ino() == CEPH_INO_CEPH)
4838 continue;
4839
4840 // remote link?
4841 // better for the MDS to do the work, if we think the client will stat any of these files.
4842 if (dnl->is_remote() && !in) {
4843 in = mdcache->get_inode(dnl->get_remote_ino());
4844 if (in) {
4845 dn->link_remote(dnl, in);
4846 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4847 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4848 continue;
4849 } else {
4850 // touch everything I _do_ have
4851 for (auto &p : *dir) {
4852 if (!p.second->get_linkage()->is_null())
4853 mdcache->lru.lru_touch(p.second);
4854 }
4855
4856 // already issued caps and leases, reply immediately.
4857 if (dnbl.length() > 0) {
4858 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4859 dout(10) << " open remote dentry after caps were issued, stopping at "
4860 << dnbl.length() << " < " << bytes_left << dendl;
4861 break;
4862 }
4863
4864 mds->locker->drop_locks(mdr.get());
4865 mdr->drop_local_auth_pins();
4866 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4867 return;
4868 }
4869 }
4870 ceph_assert(in);
4871
4872 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4873 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4874 break;
4875 }
4876
4877 unsigned start_len = dnbl.length();
4878
4879 // dentry
4880 dout(12) << "including dn " << *dn << dendl;
4881 encode(dn->get_name(), dnbl);
4882 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
4883
4884 // inode
4885 dout(12) << "including inode " << *in << dendl;
4886 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4887 if (r < 0) {
4888 // chop off dn->name, lease
4889 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4890 bufferlist keep;
4891 keep.substr_of(dnbl, 0, start_len);
4892 dnbl.swap(keep);
4893 break;
4894 }
4895 ceph_assert(r >= 0);
4896 numfiles++;
4897
4898 // touch dn
4899 mdcache->lru.lru_touch(dn);
4900 }
4901
4902 session->touch_readdir_cap(numfiles);
4903
4904 __u16 flags = 0;
4905 if (end) {
4906 flags = CEPH_READDIR_FRAG_END;
4907 if (start)
4908 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4909 }
4910 // do clients only understand the END and COMPLETE flags?
4911 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4912 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4913 }
4914
4915 // finish final blob
4916 encode(numfiles, dirbl);
4917 encode(flags, dirbl);
4918 dirbl.claim_append(dnbl);
4919
4920 // yay, reply
4921 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4922 << " bytes=" << dirbl.length()
4923 << " start=" << (int)start
4924 << " end=" << (int)end
4925 << dendl;
4926 mdr->reply_extra_bl = dirbl;
4927
4928 // bump popularity. NOTE: this doesn't quite capture it.
4929 mds->balancer->hit_dir(dir, META_POP_READDIR, numfiles);
4930
4931 // reply
4932 mdr->tracei = diri;
4933 respond_to_request(mdr, 0);
4934 }
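// A rough sketch of the reply blob assembled above, going by the encode calls
// (kept here as an orientation aid, not a wire-format spec):
//
//   dirbl : encoded DirStat | numfiles (__u32) | flags (__u16) | dnbl
//   dnbl  : per entry -> dentry name | client lease | encoded inodestat
//
// bytes_left is max_bytes minus the fixed-size front matter and the snap
// trace, which is why the loop stops early once the next entry would no
// longer fit.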
4935
4936
4937
4938 // ===============================================================================
4939 // INODE UPDATES
4940
4941
4942 /*
4943 * finisher for basic inode updates
4944 */
4945 class C_MDS_inode_update_finish : public ServerLogContext {
4946 CInode *in;
4947 bool truncating_smaller, changed_ranges, adjust_realm;
4948 public:
4949 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4950 bool sm=false, bool cr=false, bool ar=false) :
4951 ServerLogContext(s, r), in(i),
4952 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
4953 void finish(int r) override {
4954 ceph_assert(r == 0);
4955
4956 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4957
4958 // apply
4959 mdr->apply();
4960
4961 MDSRank *mds = get_mds();
4962
4963 // notify any clients
4964 if (truncating_smaller && in->get_inode()->is_truncating()) {
4965 mds->locker->issue_truncate(in);
4966 mds->mdcache->truncate_inode(in, mdr->ls);
4967 }
4968
4969 if (adjust_realm) {
4970 mds->mdcache->send_snap_update(in, 0, snap_op);
4971 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
4972 }
4973
4974 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4975
4976 server->respond_to_request(mdr, 0);
4977
4978 if (changed_ranges)
4979 get_mds()->locker->share_inode_max_size(in);
4980 }
4981 };
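// Roughly, this finisher runs once the journaled update is safe (it is the
// completion passed to journal_and_reply() by the handlers below): it applies
// the projected inode, kicks off the actual truncation if the file shrank,
// propagates any snaprealm change to peers and clients, replies to the
// client, and shares the new max_size when the client ranges changed.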
4982
4983 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4984 {
4985 const cref_t<MClientRequest> &req = mdr->client_request;
4986 MutationImpl::LockOpVec lov;
4987
4988 // get the inode to operate on, and set up any locks needed for that
4989 CInode *cur = rdlock_path_pin_ref(mdr, true);
4990 if (!cur)
4991 return;
4992
4993 lov.add_xlock(&cur->flocklock);
4994 /* acquire_locks will return true if it gets the locks. If it fails,
4995 it will re-queue this request to be retried later, so simply return here.
4996 */
4997 if (!mds->locker->acquire_locks(mdr, lov)) {
4998 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4999 return;
5000 }
5001
5002 // copy the lock change into a ceph_filelock so we can store/apply it
5003 ceph_filelock set_lock;
5004 set_lock.start = req->head.args.filelock_change.start;
5005 set_lock.length = req->head.args.filelock_change.length;
5006 set_lock.client = req->get_orig_source().num();
5007 set_lock.owner = req->head.args.filelock_change.owner;
5008 set_lock.pid = req->head.args.filelock_change.pid;
5009 set_lock.type = req->head.args.filelock_change.type;
5010 bool will_wait = req->head.args.filelock_change.wait;
5011
5012 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
5013
5014 ceph_lock_state_t *lock_state = NULL;
5015 bool interrupt = false;
5016
5017 // get the appropriate lock state
5018 switch (req->head.args.filelock_change.rule) {
5019 case CEPH_LOCK_FLOCK_INTR:
5020 interrupt = true;
5021 // fall-thru
5022 case CEPH_LOCK_FLOCK:
5023 lock_state = cur->get_flock_lock_state();
5024 break;
5025
5026 case CEPH_LOCK_FCNTL_INTR:
5027 interrupt = true;
5028 // fall-thru
5029 case CEPH_LOCK_FCNTL:
5030 lock_state = cur->get_fcntl_lock_state();
5031 break;
5032
5033 default:
5034 dout(10) << "got unknown lock type " << set_lock.type
5035 << ", dropping request!" << dendl;
5036 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
5037 return;
5038 }
5039
5040 dout(10) << " state prior to lock change: " << *lock_state << dendl;
5041 if (CEPH_LOCK_UNLOCK == set_lock.type) {
5042 list<ceph_filelock> activated_locks;
5043 MDSContext::vec waiters;
5044 if (lock_state->is_waiting(set_lock)) {
5045 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
5046 lock_state->remove_waiting(set_lock);
5047 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
5048 } else if (!interrupt) {
5049 dout(10) << " unlock attempt on " << set_lock << dendl;
5050 lock_state->remove_lock(set_lock, activated_locks);
5051 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
5052 }
5053 mds->queue_waiters(waiters);
5054
5055 respond_to_request(mdr, 0);
5056 } else {
5057 dout(10) << " lock attempt on " << set_lock << dendl;
5058 bool deadlock = false;
5059 if (mdr->more()->flock_was_waiting &&
5060 !lock_state->is_waiting(set_lock)) {
5061 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
5062 respond_to_request(mdr, -CEPHFS_EINTR);
5063 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
5064 dout(10) << " it failed on this attempt" << dendl;
5065 // couldn't set lock right now
5066 if (deadlock) {
5067 respond_to_request(mdr, -CEPHFS_EDEADLK);
5068 } else if (!will_wait) {
5069 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
5070 } else {
5071 dout(10) << " added to waiting list" << dendl;
5072 ceph_assert(lock_state->is_waiting(set_lock));
5073 mdr->more()->flock_was_waiting = true;
5074 mds->locker->drop_locks(mdr.get());
5075 mdr->drop_local_auth_pins();
5076 mdr->mark_event("failed to add lock, waiting");
5077 mdr->mark_nowarn();
5078 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
5079 }
5080 } else
5081 respond_to_request(mdr, 0);
5082 }
5083 dout(10) << " state after lock change: " << *lock_state << dendl;
5084 }
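// A note on blocked locks, going by the logic above: when will_wait is set and
// the lock can't be granted, the request stays on the lock state's waiting
// list, all locks and auth pins are dropped, and a C_MDS_RetryRequest is
// registered on WAIT_FLOCK so the request is re-driven when an unlock wakes
// the waiters. flock_was_waiting lets the retried request notice that its
// waiting entry has disappeared (e.g. it was cancelled) and return
// CEPHFS_EINTR instead of retrying forever.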
5085
5086 void Server::handle_client_file_readlock(MDRequestRef& mdr)
5087 {
5088 const cref_t<MClientRequest> &req = mdr->client_request;
5089 MutationImpl::LockOpVec lov;
5090
5091 // get the inode to operate on, and set up any locks needed for that
5092 CInode *cur = rdlock_path_pin_ref(mdr, true);
5093 if (!cur)
5094 return;
5095
5096 /* acquire_locks will return true if it gets the locks. If it fails,
5097 it will re-queue this request to be retried later, so simply return here.
5098 */
5099 lov.add_rdlock(&cur->flocklock);
5100 if (!mds->locker->acquire_locks(mdr, lov)) {
5101 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
5102 return;
5103 }
5104
5105 // copy the lock change into a ceph_filelock so we can store/apply it
5106 ceph_filelock checking_lock;
5107 checking_lock.start = req->head.args.filelock_change.start;
5108 checking_lock.length = req->head.args.filelock_change.length;
5109 checking_lock.client = req->get_orig_source().num();
5110 checking_lock.owner = req->head.args.filelock_change.owner;
5111 checking_lock.pid = req->head.args.filelock_change.pid;
5112 checking_lock.type = req->head.args.filelock_change.type;
5113
5114 // get the appropriate lock state
5115 ceph_lock_state_t *lock_state = NULL;
5116 switch (req->head.args.filelock_change.rule) {
5117 case CEPH_LOCK_FLOCK:
5118 lock_state = cur->get_flock_lock_state();
5119 break;
5120
5121 case CEPH_LOCK_FCNTL:
5122 lock_state = cur->get_fcntl_lock_state();
5123 break;
5124
5125 default:
5126 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
5127 respond_to_request(mdr, -CEPHFS_EINVAL);
5128 return;
5129 }
5130 lock_state->look_for_lock(checking_lock);
5131
5132 bufferlist lock_bl;
5133 encode(checking_lock, lock_bl);
5134
5135 mdr->reply_extra_bl = lock_bl;
5136 respond_to_request(mdr, 0);
5137 }
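// Note: look_for_lock() checks checking_lock against the existing lock state
// and the (possibly updated) ceph_filelock is encoded into reply_extra_bl, so
// the client can see whether anything conflicts (essentially GETLK-style
// behaviour, as far as this handler is concerned).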
5138
5139 void Server::handle_client_setattr(MDRequestRef& mdr)
5140 {
5141 const cref_t<MClientRequest> &req = mdr->client_request;
5142 MutationImpl::LockOpVec lov;
5143 CInode *cur = rdlock_path_pin_ref(mdr, true);
5144 if (!cur) return;
5145
5146 if (mdr->snapid != CEPH_NOSNAP) {
5147 respond_to_request(mdr, -CEPHFS_EROFS);
5148 return;
5149 }
5150 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
5151 respond_to_request(mdr, -CEPHFS_EPERM);
5152 return;
5153 }
5154
5155 __u32 mask = req->head.args.setattr.mask;
5156 __u32 access_mask = MAY_WRITE;
5157
5158 if (req->get_header().version < 6) {
5159 // No changes to fscrypted inodes by downrevved clients
5160 if (!cur->get_inode()->fscrypt_auth.empty()) {
5161 respond_to_request(mdr, -CEPHFS_EPERM);
5162 return;
5163 }
5164
5165 // Only allow fscrypt field changes by capable clients
5166 if (mask & (CEPH_SETATTR_FSCRYPT_FILE|CEPH_SETATTR_FSCRYPT_AUTH)) {
5167 respond_to_request(mdr, -CEPHFS_EINVAL);
5168 return;
5169 }
5170 }
5171
5172 // xlock inode
5173 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID))
5174 lov.add_xlock(&cur->authlock);
5175 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE|CEPH_SETATTR_FSCRYPT_FILE))
5176 lov.add_xlock(&cur->filelock);
5177 if (mask & CEPH_SETATTR_CTIME)
5178 lov.add_wrlock(&cur->versionlock);
5179
5180 if (!mds->locker->acquire_locks(mdr, lov))
5181 return;
5182
5183 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
5184 access_mask |= MAY_CHOWN;
5185
5186 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
5187 access_mask |= MAY_CHGRP;
5188
5189 if (!check_access(mdr, cur, access_mask))
5190 return;
5191
5192 // trunc from bigger -> smaller?
5193 const auto& pip = cur->get_projected_inode();
5194
5195 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
5196
5197 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5198 if (is_full && req->head.args.setattr.size > old_size) {
5199 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5200 respond_to_request(mdr, -CEPHFS_ENOSPC);
5201 return;
5202 }
5203
5204 bool truncating_smaller = false;
5205 if (mask & CEPH_SETATTR_SIZE) {
5206 if (req->get_data().length() >
5207 sizeof(struct ceph_fscrypt_last_block_header) + fscrypt_last_block_max_size) {
5208 dout(10) << __func__ << ": the last block size is too large" << dendl;
5209 respond_to_request(mdr, -CEPHFS_EINVAL);
5210 return;
5211 }
5212
5213 truncating_smaller = req->head.args.setattr.size < old_size ||
5214 (req->head.args.setattr.size == old_size && req->get_data().length());
5215 if (truncating_smaller && pip->is_truncating()) {
5216 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5217 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5218 mds->locker->drop_locks(mdr.get());
5219 mdr->drop_local_auth_pins();
5220 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5221 return;
5222 }
5223
5224 if (truncating_smaller && req->get_data().length()) {
5225 struct ceph_fscrypt_last_block_header header;
5226 memset(&header, 0, sizeof(header));
5227 auto bl = req->get_data().cbegin();
5228 DECODE_START(1, bl);
5229 decode(header.change_attr, bl);
5230 DECODE_FINISH(bl);
5231
5232 dout(20) << __func__ << " mdr->retry:" << mdr->retry
5233 << " header.change_attr: " << header.change_attr
5234 << " header.file_offset: " << header.file_offset
5235 << " header.block_size: " << header.block_size
5236 << dendl;
5237
5238 if (header.change_attr != pip->change_attr) {
5239 dout(5) << __func__ << ": header.change_attr:" << header.change_attr
5240 << " != current change_attr:" << pip->change_attr
5241 << ", let client retry it!" << dendl;
5242 // flush the journal to make sure the client gets the latest
5243 // change_attr possible for the next retry
5244 mds->mdlog->flush();
5245 respond_to_request(mdr, -CEPHFS_EAGAIN);
5246 return;
5247 }
5248 }
5249 }
5250
5251 bool changed_ranges = false;
5252
5253 // project update
5254 mdr->ls = mdlog->get_current_segment();
5255 EUpdate *le = new EUpdate(mdlog, "setattr");
5256 mdlog->start_entry(le);
5257
5258 auto pi = cur->project_inode(mdr);
5259
5260 if (mask & CEPH_SETATTR_UID)
5261 pi.inode->uid = req->head.args.setattr.uid;
5262 if (mask & CEPH_SETATTR_GID)
5263 pi.inode->gid = req->head.args.setattr.gid;
5264
5265 if (mask & CEPH_SETATTR_MODE)
5266 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5267 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID|
5268 CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID)) &&
5269 S_ISREG(pi.inode->mode)) {
5270 if (mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID) &&
5271 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5272 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5273 } else {
5274 if (mask & CEPH_SETATTR_KILL_SUID) {
5275 pi.inode->mode &= ~S_ISUID;
5276 }
5277 if (mask & CEPH_SETATTR_KILL_SGID) {
5278 pi.inode->mode &= ~S_ISGID;
5279 }
5280 }
5281 }
5282
5283 if (mask & CEPH_SETATTR_MTIME)
5284 pi.inode->mtime = req->head.args.setattr.mtime;
5285 if (mask & CEPH_SETATTR_ATIME)
5286 pi.inode->atime = req->head.args.setattr.atime;
5287 if (mask & CEPH_SETATTR_BTIME)
5288 pi.inode->btime = req->head.args.setattr.btime;
5289 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5290 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5291 if (mask & CEPH_SETATTR_SIZE) {
5292 if (truncating_smaller) {
5293 pi.inode->truncate(old_size, req->head.args.setattr.size, req->get_data());
5294 le->metablob.add_truncate_start(cur->ino());
5295 } else {
5296 pi.inode->size = req->head.args.setattr.size;
5297 pi.inode->rstat.rbytes = pi.inode->size;
5298 }
5299 pi.inode->mtime = mdr->get_op_stamp();
5300
5301 // adjust client's max_size?
5302 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5303 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5304 << " -> " << pi.inode->client_ranges << dendl;
5305 changed_ranges = true;
5306 }
5307 }
5308
5309 if (mask & CEPH_SETATTR_FSCRYPT_AUTH)
5310 pi.inode->fscrypt_auth = req->fscrypt_auth;
5311 if (mask & CEPH_SETATTR_FSCRYPT_FILE)
5312 pi.inode->fscrypt_file = req->fscrypt_file;
5313
5314 pi.inode->version = cur->pre_dirty();
5315 pi.inode->ctime = mdr->get_op_stamp();
5316 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5317 pi.inode->rstat.rctime = mdr->get_op_stamp();
5318 pi.inode->change_attr++;
5319
5320 // log + wait
5321 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5322 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5323 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5324
5325 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5326 truncating_smaller, changed_ranges));
5327
5328 // flush immediately if there are readers/writers waiting
5329 if (mdr->is_xlocked(&cur->filelock) &&
5330 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5331 mds->mdlog->flush();
5332 }
5333
5334 /* Takes responsibility for mdr */
5335 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5336 {
5337 CInode *in = mdr->in[0];
5338 client_t client = mdr->get_client();
5339 ceph_assert(in);
5340
5341 dout(10) << "do_open_truncate " << *in << dendl;
5342
5343 SnapRealm *realm = in->find_snaprealm();
5344 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5345
5346 mdr->ls = mdlog->get_current_segment();
5347 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5348 mdlog->start_entry(le);
5349
5350 // prepare
5351 auto pi = in->project_inode(mdr);
5352 pi.inode->version = in->pre_dirty();
5353 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5354 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5355 pi.inode->rstat.rctime = mdr->get_op_stamp();
5356 pi.inode->change_attr++;
5357
5358 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5359 if (old_size > 0) {
5360 pi.inode->truncate(old_size, 0);
5361 le->metablob.add_truncate_start(in->ino());
5362 }
5363
5364 bool changed_ranges = false;
5365 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5366 pi.inode->client_ranges[client].range.first = 0;
5367 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5368 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5369 changed_ranges = true;
5370 in->mark_clientwriteable();
5371 cap->mark_clientwriteable();
5372 }
5373
5374 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5375
5376 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5377 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5378
5379 // make sure ino gets into the journal
5380 le->metablob.add_opened_ino(in->ino());
5381
5382 mdr->o_trunc = true;
5383
5384 CDentry *dn = 0;
5385 if (mdr->client_request->get_dentry_wanted()) {
5386 ceph_assert(mdr->dn[0].size());
5387 dn = mdr->dn[0].back();
5388 }
5389
5390 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5391 changed_ranges));
5392 // Although the `open` part can give an early reply, the truncation won't
5393 // happen until our EUpdate is persistent; to give the client a prompt
5394 // response we must also flush that event.
5395 mdlog->flush();
5396 }
5397
5398
5399 /* This function cleans up the passed mdr */
5400 void Server::handle_client_setlayout(MDRequestRef& mdr)
5401 {
5402 const cref_t<MClientRequest> &req = mdr->client_request;
5403 CInode *cur = rdlock_path_pin_ref(mdr, true);
5404 if (!cur) return;
5405
5406 if (mdr->snapid != CEPH_NOSNAP) {
5407 respond_to_request(mdr, -CEPHFS_EROFS);
5408 return;
5409 }
5410 if (!cur->is_file()) {
5411 respond_to_request(mdr, -CEPHFS_EINVAL);
5412 return;
5413 }
5414 if (cur->get_projected_inode()->size ||
5415 cur->get_projected_inode()->truncate_seq > 1) {
5416 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5417 return;
5418 }
5419
5420 // validate layout
5421 file_layout_t layout = cur->get_projected_inode()->layout;
5422 // save existing layout for later
5423 const auto old_layout = layout;
5424
5425 int access = MAY_WRITE;
5426
5427 if (req->head.args.setlayout.layout.fl_object_size > 0)
5428 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5429 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5430 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5431 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5432 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5433 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5434 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5435
5436 // make sure we have as new a map as the client
5437 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5438 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5439 return;
5440 }
5441 }
5442
5443 // Don't permit layout modifications without 'p' caps
5444 if (layout != old_layout) {
5445 access |= MAY_SET_VXATTR;
5446 }
5447
5448 if (!layout.is_valid()) {
5449 dout(10) << "bad layout" << dendl;
5450 respond_to_request(mdr, -CEPHFS_EINVAL);
5451 return;
5452 }
5453 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5454 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5455 respond_to_request(mdr, -CEPHFS_EINVAL);
5456 return;
5457 }
5458
5459 MutationImpl::LockOpVec lov;
5460 lov.add_xlock(&cur->filelock);
5461 if (!mds->locker->acquire_locks(mdr, lov))
5462 return;
5463
5464 if (!check_access(mdr, cur, access))
5465 return;
5466
5467 // project update
5468 auto pi = cur->project_inode(mdr);
5469 pi.inode->layout = layout;
5470 // add the old pool to the inode
5471 pi.inode->add_old_pool(old_layout.pool_id);
5472 pi.inode->version = cur->pre_dirty();
5473 pi.inode->ctime = mdr->get_op_stamp();
5474 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5475 pi.inode->rstat.rctime = mdr->get_op_stamp();
5476 pi.inode->change_attr++;
5477
5478 // log + wait
5479 mdr->ls = mdlog->get_current_segment();
5480 EUpdate *le = new EUpdate(mdlog, "setlayout");
5481 mdlog->start_entry(le);
5482 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5483 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5484 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5485
5486 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5487 }
5488
5489 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5490 {
5491 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5492 return true;
5493
5494 MutationImpl::LockOpVec lov;
5495 lov.add_xlock(&in->policylock);
5496 if (xlock_snaplock)
5497 lov.add_xlock(&in->snaplock);
5498 else
5499 lov.add_rdlock(&in->snaplock);
5500 if (!mds->locker->acquire_locks(mdr, lov))
5501 return false;
5502
5503 if (want_layout && in->get_projected_inode()->has_layout()) {
5504 mdr->dir_layout = in->get_projected_inode()->layout;
5505 want_layout = false;
5506 }
5507 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5508 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5509 return false;
5510 }
5511
5512 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5513 return true;
5514 }
5515
5516 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5517 {
5518 CInode *in = mdcache->get_inode(ino);
5519 if (!in || in->state_test(CInode::STATE_PURGING)) {
5520 respond_to_request(mdr, -CEPHFS_ESTALE);
5521 return nullptr;
5522 }
5523 if (!in->is_auth()) {
5524 mdcache->request_forward(mdr, in->authority().first);
5525 return nullptr;
5526 }
5527
5528 return in;
5529 }
5530
5531 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5532 {
5533 const cref_t<MClientRequest> &req = mdr->client_request;
5534
5535 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5536 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5537 if (!cur)
5538 return;
5539
5540 if (!cur->is_dir()) {
5541 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5542 return;
5543 }
5544
5545 if (!xlock_policylock(mdr, cur, true))
5546 return;
5547
5548 // validate layout
5549 const auto& old_pi = cur->get_projected_inode();
5550 file_layout_t layout;
5551 if (old_pi->has_layout())
5552 layout = old_pi->layout;
5553 else if (mdr->dir_layout != file_layout_t())
5554 layout = mdr->dir_layout;
5555 else
5556 layout = mdcache->default_file_layout;
5557
5558 // Level of access required to complete
5559 int access = MAY_WRITE;
5560
5561 const auto old_layout = layout;
5562
5563 if (req->head.args.setlayout.layout.fl_object_size > 0)
5564 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5565 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5566 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5567 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5568 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5569 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5570 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5571 // make sure we have as new a map as the client
5572 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5573 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5574 return;
5575 }
5576 }
5577
5578 if (layout != old_layout) {
5579 access |= MAY_SET_VXATTR;
5580 }
5581
5582 if (!layout.is_valid()) {
5583 dout(10) << "bad layout" << dendl;
5584 respond_to_request(mdr, -CEPHFS_EINVAL);
5585 return;
5586 }
5587 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5588 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5589 respond_to_request(mdr, -CEPHFS_EINVAL);
5590 return;
5591 }
5592
5593 if (!check_access(mdr, cur, access))
5594 return;
5595
5596 auto pi = cur->project_inode(mdr);
5597 pi.inode->layout = layout;
5598 pi.inode->version = cur->pre_dirty();
5599
5600 // log + wait
5601 mdr->ls = mdlog->get_current_segment();
5602 EUpdate *le = new EUpdate(mdlog, "setlayout");
5603 mdlog->start_entry(le);
5604 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5605 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5606 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5607
5608 mdr->no_early_reply = true;
5609 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5610 }
5611
5612 // XATTRS
5613 int Server::parse_layout_vxattr_json(
5614 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5615 {
5616 auto parse_pool = [&](std::string pool_name, int64_t pool_id) -> int64_t {
5617 if (pool_name != "") {
5618 int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
5619 if (_pool_id < 0) {
5620 dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
5621 return -CEPHFS_EINVAL;
5622 }
5623 return _pool_id;
5624 } else if (pool_id >= 0) {
5625 const auto pools = osdmap.get_pools();
5626 if (pools.find(pool_id) == pools.end()) {
5627 dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
5628 return -CEPHFS_EINVAL;
5629 }
5630 return pool_id;
5631 } else {
5632 return -CEPHFS_EINVAL;
5633 }
5634 };
5635
5636 try {
5637 if (name == "layout.json") {
5638 JSONParser json_parser;
5639 if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
5640 std::string field;
5641 try {
5642 field = "object_size";
5643 JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
5644
5645 field = "stripe_unit";
5646 JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
5647
5648 field = "stripe_count";
5649 JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
5650
5651 field = "pool_namespace";
5652 JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
5653
5654 field = "pool_id";
5655 int64_t pool_id = 0;
5656 JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
5657
5658 field = "pool_name";
5659 std::string pool_name;
5660 JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
5661
5662 pool_id = parse_pool(pool_name, pool_id);
5663 if (pool_id < 0) {
5664 return (int)pool_id;
5665 }
5666 layout->pool_id = pool_id;
5667 } catch (JSONDecoder::err&) {
5668 dout(10) << __func__ << ": json is missing a mandatory field named "
5669 << field << dendl;
5670 return -CEPHFS_EINVAL;
5671 }
5672 } else {
5673 dout(10) << __func__ << ": bad json" << dendl;
5674 return -CEPHFS_EINVAL;
5675 }
5676 } else {
5677 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5678 return -CEPHFS_ENODATA; // no such attribute
5679 }
5680 } catch (boost::bad_lexical_cast const&) {
5681 dout(10) << __func__ << ": bad vxattr value:" << value
5682 << ", unable to parse for xattr:" << name << dendl;
5683 return -CEPHFS_EINVAL;
5684 }
5685 return 0;
5686 }
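// Illustrative only: going by the decode_json calls above, a "layout.json"
// value might look like
//   {"object_size": 4194304, "stripe_unit": 4194304, "stripe_count": 1,
//    "pool_name": "cephfs_data", "pool_namespace": ""}
// object_size, stripe_unit and stripe_count are decoded as mandatory fields;
// pool_name/pool_id and pool_namespace as optional ones. "cephfs_data" is a
// hypothetical pool name used for the example.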
5687
5688 // parse old style layout string
5689 int Server::parse_layout_vxattr_string(
5690 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5691 {
5692 try {
5693 if (name == "layout") {
5694 string::iterator begin = value.begin();
5695 string::iterator end = value.end();
5696 keys_and_values<string::iterator> p; // create instance of parser
5697 std::map<string, string> m; // map to receive results
5698 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5699 return -CEPHFS_EINVAL;
5700 }
5701 string left(begin, end);
5702 dout(10) << __func__ << ": parsed " << m << " left '" << left << "'" << dendl;
5703 if (begin != end)
5704 return -CEPHFS_EINVAL;
5705 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5706 // Skip validation on each attr, we do it once at the end (avoid
5707 // rejecting intermediate states if the overall result is ok)
5708 int r = parse_layout_vxattr_string(string("layout.") + q->first, q->second,
5709 osdmap, layout);
5710 if (r < 0)
5711 return r;
5712 }
5713 } else if (name == "layout.object_size") {
5714 layout->object_size = boost::lexical_cast<unsigned>(value);
5715 } else if (name == "layout.stripe_unit") {
5716 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5717 } else if (name == "layout.stripe_count") {
5718 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5719 } else if (name == "layout.pool") {
5720 try {
5721 layout->pool_id = boost::lexical_cast<unsigned>(value);
5722 } catch (boost::bad_lexical_cast const&) {
5723 int64_t pool = osdmap.lookup_pg_pool_name(value);
5724 if (pool < 0) {
5725 dout(10) << __func__ << ": unknown pool " << value << dendl;
5726 return -CEPHFS_ENOENT;
5727 }
5728 layout->pool_id = pool;
5729 }
5730 } else if (name == "layout.pool_id") {
5731 layout->pool_id = boost::lexical_cast<int64_t>(value);
5732 } else if (name == "layout.pool_name") {
5733 layout->pool_id = osdmap.lookup_pg_pool_name(value);
5734 if (layout->pool_id < 0) {
5735 dout(10) << __func__ << ": unknown pool " << value << dendl;
5736 return -CEPHFS_EINVAL;
5737 }
5738 } else if (name == "layout.pool_namespace") {
5739 layout->pool_ns = value;
5740 } else {
5741 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5742 return -CEPHFS_ENODATA; // no such attribute
5743 }
5744 } catch (boost::bad_lexical_cast const&) {
5745 dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
5746 << name << dendl;
5747 return -CEPHFS_EINVAL;
5748 }
5749 return 0;
5750 }
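// Illustrative only: the old-style "layout" form parsed above is a set of
// key=value pairs handled by the keys_and_values grammar (presumably
// space-separated), e.g. something like
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
// while the "layout.<field>" names take a single bare value. "cephfs_data" is
// a hypothetical pool name.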
5751
5752 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5753 file_layout_t *layout, bool validate)
5754 {
5755 dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
5756
5757 int r;
5758 if (name == "layout.json") {
5759 r = parse_layout_vxattr_json(name, value, osdmap, layout);
5760 } else {
5761 r = parse_layout_vxattr_string(name, value, osdmap, layout);
5762 }
5763 if (r < 0) {
5764 return r;
5765 }
5766
5767 if (validate && !layout->is_valid()) {
5768 dout(10) << __func__ << ": bad layout" << dendl;
5769 return -CEPHFS_EINVAL;
5770 }
5771 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5772 dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
5773 return -CEPHFS_EINVAL;
5774 }
5775 return 0;
5776 }
5777
5778 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5779 {
5780 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5781 try {
5782 if (name == "quota") {
5783 string::iterator begin = value.begin();
5784 string::iterator end = value.end();
5785 if (begin == end) {
5786 // keep quota unchanged. (for create_quota_realm())
5787 return 0;
5788 }
5789 keys_and_values<string::iterator> p; // create instance of parser
5790 std::map<string, string> m; // map to receive results
5791 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5792 return -CEPHFS_EINVAL;
5793 }
5794 string left(begin, end);
5795 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5796 if (begin != end)
5797 return -CEPHFS_EINVAL;
5798 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5799 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5800 if (r < 0)
5801 return r;
5802 }
5803 } else if (name == "quota.max_bytes") {
5804 int64_t q = boost::lexical_cast<int64_t>(value);
5805 if (q < 0)
5806 return -CEPHFS_EINVAL;
5807 quota->max_bytes = q;
5808 } else if (name == "quota.max_files") {
5809 int64_t q = boost::lexical_cast<int64_t>(value);
5810 if (q < 0)
5811 return -CEPHFS_EINVAL;
5812 quota->max_files = q;
5813 } else {
5814 dout(10) << " unknown quota vxattr " << name << dendl;
5815 return -CEPHFS_EINVAL;
5816 }
5817 } catch (boost::bad_lexical_cast const&) {
5818 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5819 return -CEPHFS_EINVAL;
5820 }
5821
5822 if (!quota->is_valid()) {
5823 dout(10) << "bad quota" << dendl;
5824 return -CEPHFS_EINVAL;
5825 }
5826 return 0;
5827 }
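// Illustrative only: the combined "quota" form accepts key=value pairs for
// the fields handled above, e.g.
//   "max_bytes=107374182400 max_files=100000"
// while "quota.max_bytes" / "quota.max_files" take a single non-negative
// integer. An empty value leaves the quota unchanged (used by
// create_quota_realm() below).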
5828
5829 void Server::create_quota_realm(CInode *in)
5830 {
5831 dout(10) << __func__ << " " << *in << dendl;
5832
5833 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5834 req->set_filepath(filepath(in->ino()));
5835 req->set_string2("ceph.quota");
5836 // empty vxattr value
5837 req->set_tid(mds->issue_tid());
5838
5839 mds->send_message_mds(req, in->authority().first);
5840 }
5841
5842 /*
5843 * Verify that the file layout attribute carried by the client
5844 * is well-formatted.
5845 * Return 0 on success, otherwise this function takes
5846 * responsibility for the passed mdr.
5847 */
5848 int Server::check_layout_vxattr(MDRequestRef& mdr,
5849 string name,
5850 string value,
5851 file_layout_t *layout)
5852 {
5853 const cref_t<MClientRequest> &req = mdr->client_request;
5854 epoch_t epoch;
5855 int r;
5856
5857 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5858 r = parse_layout_vxattr(name, value, osdmap, layout);
5859 epoch = osdmap.get_epoch();
5860 });
5861
5862 if (r == -CEPHFS_ENOENT) {
5863
5864 // we don't have the specified pool, make sure our map
5865 // is newer than or as new as the client's.
5866 epoch_t req_epoch = req->get_osdmap_epoch();
5867
5868 if (req_epoch > epoch) {
5869
5870 // well, our map is older; wait for a newer osdmap before retrying.
5871 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5872
5873 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5874 return r;
5875 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5876
5877 // For compatibility with clients running old code, we still need to get the
5878 // latest map. One day, once COMPACT_VERSION of MClientRequest >= 3,
5879 // we can remove this code.
5880 mdr->waited_for_osdmap = true;
5881 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5882 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5883 return r;
5884 }
5885 }
5886
5887 if (r < 0) {
5888
5889 if (r == -CEPHFS_ENOENT)
5890 r = -CEPHFS_EINVAL;
5891
5892 respond_to_request(mdr, r);
5893 return r;
5894 }
5895
5896 // all is well
5897 return 0;
5898 }
5899
5900 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5901 {
5902 const cref_t<MClientRequest> &req = mdr->client_request;
5903 MutationImpl::LockOpVec lov;
5904 string name(req->get_path2());
5905 bufferlist bl = req->get_data();
5906 string value (bl.c_str(), bl.length());
5907 dout(10) << "handle_set_vxattr " << name
5908 << " val " << value.length()
5909 << " bytes on " << *cur
5910 << dendl;
5911
5912 CInode::mempool_inode *pip = nullptr;
5913 string rest;
5914
5915 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5916 return;
5917 }
5918
5919 bool adjust_realm = false;
5920 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5921 if (!cur->is_dir()) {
5922 respond_to_request(mdr, -CEPHFS_EINVAL);
5923 return;
5924 }
5925
5926 if (!xlock_policylock(mdr, cur, true))
5927 return;
5928
5929 /* We need 'As' caps for the fscrypt context */
5930 lov.add_xlock(&cur->authlock);
5931 if (!mds->locker->acquire_locks(mdr, lov)) {
5932 return;
5933 }
5934
5935 /* encrypted directories can't have their layout changed */
5936 if (!cur->get_inode()->fscrypt_auth.empty()) {
5937 respond_to_request(mdr, -CEPHFS_EINVAL);
5938 return;
5939 }
5940
5941 file_layout_t layout;
5942 if (cur->get_projected_inode()->has_layout())
5943 layout = cur->get_projected_inode()->layout;
5944 else if (mdr->dir_layout != file_layout_t())
5945 layout = mdr->dir_layout;
5946 else
5947 layout = mdcache->default_file_layout;
5948
5949 rest = name.substr(name.find("layout"));
5950 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5951 return;
5952
5953 auto pi = cur->project_inode(mdr);
5954 pi.inode->layout = layout;
5955 mdr->no_early_reply = true;
5956 pip = pi.inode.get();
5957 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5958 if (!cur->is_file()) {
5959 respond_to_request(mdr, -CEPHFS_EINVAL);
5960 return;
5961 }
5962 if (cur->get_projected_inode()->size ||
5963 cur->get_projected_inode()->truncate_seq > 1) {
5964 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5965 return;
5966 }
5967 file_layout_t layout = cur->get_projected_inode()->layout;
5968 rest = name.substr(name.find("layout"));
5969 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5970 return;
5971
5972 lov.add_xlock(&cur->filelock);
5973 if (!mds->locker->acquire_locks(mdr, lov))
5974 return;
5975
5976 /* encrypted files can't have their layout changed */
5977 if (!cur->get_inode()->fscrypt_auth.empty()) {
5978 respond_to_request(mdr, -CEPHFS_EINVAL);
5979 return;
5980 }
5981
5982 auto pi = cur->project_inode(mdr);
5983 int64_t old_pool = pi.inode->layout.pool_id;
5984 pi.inode->add_old_pool(old_pool);
5985 pi.inode->layout = layout;
5986 pip = pi.inode.get();
5987 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5988 if (!cur->is_dir()) {
5989 respond_to_request(mdr, -CEPHFS_EINVAL);
5990 return;
5991 }
5992
5993 quota_info_t quota = cur->get_projected_inode()->quota;
5994
5995 rest = name.substr(name.find("quota"));
5996 int r = parse_quota_vxattr(rest, value, &quota);
5997 if (r < 0) {
5998 respond_to_request(mdr, r);
5999 return;
6000 }
6001
6002 if (quota.is_enabled() && !cur->get_projected_srnode())
6003 adjust_realm = true;
6004
6005 if (!xlock_policylock(mdr, cur, false, adjust_realm))
6006 return;
6007
6008 if (cur->get_projected_inode()->quota == quota) {
6009 respond_to_request(mdr, 0);
6010 return;
6011 }
6012
6013 auto pi = cur->project_inode(mdr, false, adjust_realm);
6014 pi.inode->quota = quota;
6015
6016 if (adjust_realm)
6017 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
6018
6019 mdr->no_early_reply = true;
6020 pip = pi.inode.get();
6021
6022 client_t exclude_ct = mdr->get_client();
6023 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
6024 } else if (name == "ceph.dir.subvolume"sv) {
6025 if (!cur->is_dir()) {
6026 respond_to_request(mdr, -CEPHFS_EINVAL);
6027 return;
6028 }
6029
6030 bool val;
6031 try {
6032 val = boost::lexical_cast<bool>(value);
6033 } catch (boost::bad_lexical_cast const&) {
6034 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
6035 respond_to_request(mdr, -CEPHFS_EINVAL);
6036 return;
6037 }
6038
6039 /* Verify it's not already a subvolume using a lighter-weight
6040 * rdlock first.
6041 */
6042 if (!mdr->more()->rdonly_checks) {
6043 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6044 lov.add_rdlock(&cur->snaplock);
6045 if (!mds->locker->acquire_locks(mdr, lov))
6046 return;
6047 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6048 }
6049 const auto srnode = cur->get_projected_srnode();
6050 if (val == (srnode && srnode->is_subvolume())) {
6051 dout(20) << "already marked subvolume" << dendl;
6052 respond_to_request(mdr, 0);
6053 return;
6054 }
6055 mdr->more()->rdonly_checks = true;
6056 }
6057
6058 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
6059 /* drop the rdlock and acquire xlocks */
6060 dout(20) << "dropping rdlocks" << dendl;
6061 mds->locker->drop_locks(mdr.get());
6062 if (!xlock_policylock(mdr, cur, false, true))
6063 return;
6064 }
6065
6066 /* repeat the rdonly checks in case the state changed between the rdlock and the xlock */
6067 SnapRealm *realm = cur->find_snaprealm();
6068 if (val) {
6069 inodeno_t subvol_ino = realm->get_subvolume_ino();
6070 // can't create subvolume inside another subvolume
6071 if (subvol_ino && subvol_ino != cur->ino()) {
6072 respond_to_request(mdr, -CEPHFS_EINVAL);
6073 return;
6074 }
6075 }
6076
6077 const auto srnode = cur->get_projected_srnode();
6078 if (val == (srnode && srnode->is_subvolume())) {
6079 respond_to_request(mdr, 0);
6080 return;
6081 }
6082
6083 auto pi = cur->project_inode(mdr, false, true);
6084 if (!srnode)
6085 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
6086 if (val)
6087 pi.snapnode->mark_subvolume();
6088 else
6089 pi.snapnode->clear_subvolume();
6090
6091 mdr->no_early_reply = true;
6092 pip = pi.inode.get();
6093 adjust_realm = true;
6094 } else if (name == "ceph.dir.pin"sv) {
6095 if (!cur->is_dir() || cur->is_root()) {
6096 respond_to_request(mdr, -CEPHFS_EINVAL);
6097 return;
6098 }
6099
6100 mds_rank_t rank;
6101 try {
6102 rank = boost::lexical_cast<mds_rank_t>(value);
6103 if (rank < 0) rank = MDS_RANK_NONE;
6104 else if (rank >= MAX_MDS) {
6105 respond_to_request(mdr, -CEPHFS_EDOM);
6106 return;
6107 }
6108 } catch (boost::bad_lexical_cast const&) {
6109 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
6110 respond_to_request(mdr, -CEPHFS_EINVAL);
6111 return;
6112 }
6113
6114 if (!xlock_policylock(mdr, cur))
6115 return;
6116
6117 auto pi = cur->project_inode(mdr);
6118 cur->set_export_pin(rank);
6119 pip = pi.inode.get();
6120 } else if (name == "ceph.dir.pin.random"sv) {
6121 if (!cur->is_dir() || cur->is_root()) {
6122 respond_to_request(mdr, -CEPHFS_EINVAL);
6123 return;
6124 }
6125
6126 double val;
6127 try {
6128 val = boost::lexical_cast<double>(value);
6129 } catch (boost::bad_lexical_cast const&) {
6130 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
6131 respond_to_request(mdr, -CEPHFS_EINVAL);
6132 return;
6133 }
6134
6135 if (val < 0.0 || 1.0 < val) {
6136 respond_to_request(mdr, -CEPHFS_EDOM);
6137 return;
6138 } else if (mdcache->export_ephemeral_random_max < val) {
6139 respond_to_request(mdr, -CEPHFS_EINVAL);
6140 return;
6141 }
6142
6143 if (!xlock_policylock(mdr, cur))
6144 return;
6145
6146 auto pi = cur->project_inode(mdr);
6147 cur->setxattr_ephemeral_rand(val);
6148 pip = pi.inode.get();
6149 } else if (name == "ceph.dir.pin.distributed"sv) {
6150 if (!cur->is_dir() || cur->is_root()) {
6151 respond_to_request(mdr, -CEPHFS_EINVAL);
6152 return;
6153 }
6154
6155 bool val;
6156 try {
6157 val = boost::lexical_cast<bool>(value);
6158 } catch (boost::bad_lexical_cast const&) {
6159 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
6160 respond_to_request(mdr, -CEPHFS_EINVAL);
6161 return;
6162 }
6163
6164 if (!xlock_policylock(mdr, cur))
6165 return;
6166
6167 auto pi = cur->project_inode(mdr);
6168 cur->setxattr_ephemeral_dist(val);
6169 pip = pi.inode.get();
6170 } else {
6171 dout(10) << " unknown vxattr " << name << dendl;
6172 respond_to_request(mdr, -CEPHFS_EINVAL);
6173 return;
6174 }
6175
6176 pip->change_attr++;
6177 pip->ctime = mdr->get_op_stamp();
6178 if (mdr->get_op_stamp() > pip->rstat.rctime)
6179 pip->rstat.rctime = mdr->get_op_stamp();
6180 pip->version = cur->pre_dirty();
6181 if (cur->is_file())
6182 pip->update_backtrace();
6183
6184 // log + wait
6185 mdr->ls = mdlog->get_current_segment();
6186 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
6187 mdlog->start_entry(le);
6188 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6189 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6190 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6191
6192 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
6193 false, false, adjust_realm));
6194 return;
6195 }
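// Illustrative only: the vxattrs handled above (ceph.dir.layout*,
// ceph.file.layout*, ceph.quota*, ceph.dir.subvolume, ceph.dir.pin,
// ceph.dir.pin.random, ceph.dir.pin.distributed) arrive as ordinary setxattr
// requests from the client, e.g. something like
//   setfattr -n ceph.dir.pin -v 1 /mnt/cephfs/some/dir
// on a mounted filesystem; the MDS applies them as metadata/policy updates
// rather than storing them as real xattrs.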
6196
6197 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
6198 {
6199 const cref_t<MClientRequest> &req = mdr->client_request;
6200 string name(req->get_path2());
6201
6202 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
6203
6204 if (name == "ceph.dir.layout") {
6205 if (!cur->is_dir()) {
6206 respond_to_request(mdr, -CEPHFS_ENODATA);
6207 return;
6208 }
6209 if (cur->is_root()) {
6210 dout(10) << "can't remove layout policy on the root directory" << dendl;
6211 respond_to_request(mdr, -CEPHFS_EINVAL);
6212 return;
6213 }
6214
6215 if (!cur->get_projected_inode()->has_layout()) {
6216 respond_to_request(mdr, -CEPHFS_ENODATA);
6217 return;
6218 }
6219
6220 MutationImpl::LockOpVec lov;
6221 lov.add_xlock(&cur->policylock);
6222 if (!mds->locker->acquire_locks(mdr, lov))
6223 return;
6224
6225 auto pi = cur->project_inode(mdr);
6226 pi.inode->clear_layout();
6227 pi.inode->version = cur->pre_dirty();
6228
6229 // log + wait
6230 mdr->ls = mdlog->get_current_segment();
6231 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
6232 mdlog->start_entry(le);
6233 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6234 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6235 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6236
6237 mdr->no_early_reply = true;
6238 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6239 return;
6240 } else if (name == "ceph.dir.layout.pool_namespace"
6241 || name == "ceph.file.layout.pool_namespace") {
6242 // Namespace is the only layout field that has a meaningful
6243 // null/none value (an empty string means the default layout). Removing it
6244 // is equivalent to a setxattr with an empty string: pass through the empty
6245 // payload of the rmxattr request to do this.
6246 handle_set_vxattr(mdr, cur);
6247 return;
6248 }
6249
6250 respond_to_request(mdr, -CEPHFS_ENODATA);
6251 }
6252
6253 const Server::XattrHandler Server::xattr_handlers[] = {
6254 {
6255 xattr_name: Server::DEFAULT_HANDLER,
6256 description: "default xattr handler",
6257 validate: &Server::default_xattr_validate,
6258 setxattr: &Server::default_setxattr_handler,
6259 removexattr: &Server::default_removexattr_handler,
6260 },
6261 {
6262 xattr_name: "ceph.mirror.info",
6263 description: "mirror info xattr handler",
6264 validate: &Server::mirror_info_xattr_validate,
6265 setxattr: &Server::mirror_info_setxattr_handler,
6266 removexattr: &Server::mirror_info_removexattr_handler
6267 },
6268 };
6269
6270 const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
6271 const XattrHandler *default_xattr_handler = nullptr;
6272
6273 for (auto &handler : xattr_handlers) {
6274 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
6275 ceph_assert(default_xattr_handler == nullptr);
6276 default_xattr_handler = &handler;
6277 }
6278 if (handler.xattr_name == xattr_name) {
6279 dout(20) << "handler=" << handler.description << dendl;
6280 return &handler;
6281 }
6282 }
6283
6284 ceph_assert(default_xattr_handler != nullptr);
6285 dout(20) << "handler=" << default_xattr_handler->description << dendl;
6286 return default_xattr_handler;
6287 }
6288
6289 int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6290 const std::string &xattr_name, int op, int flags) {
6291 if (op == CEPH_MDS_OP_SETXATTR) {
6292 if (xattrs) {
6293 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
6294 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
6295 return -CEPHFS_EEXIST;
6296 }
6297 }
6298 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
6299 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
6300 return -CEPHFS_ENODATA;
6301 }
6302
6303 return 0;
6304 }
6305
6306 if (op == CEPH_MDS_OP_RMXATTR) {
6307 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6308 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6309 return -CEPHFS_ENODATA;
6310 }
6311
6312 return 0;
6313 }
6314
6315 derr << ": unhandled validation for: " << xattr_name << dendl;
6316 return -CEPHFS_EINVAL;
6317 }
6318
6319 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6320 const bufferlist &xattr_value) {
6321 size_t len = xattr_value.length();
6322 bufferptr b = buffer::create(len);
6323 if (len) {
6324 xattr_value.begin().copy(len, b.c_str());
6325 }
6326 auto em = xattrs->emplace(std::piecewise_construct,
6327 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6328 std::forward_as_tuple(b));
6329 if (!em.second) {
6330 em.first->second = b;
6331 }
6332 }
6333
6334 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6335 xattrs->erase(mempool::mds_co::string(xattr_name));
6336 }
6337
6338 int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6339 XattrOp *xattr_op) {
6340 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6341 }
6342
6343 void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6344 const XattrOp &xattr_op) {
6345 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6346 }
6347
6348 void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6349 const XattrOp &xattr_op) {
6350 xattr_rm(xattrs, xattr_op.xattr_name);
6351 }
6352
6353 // mirror info xattr handlers
6354 const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6355 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6356 "[a-f0-9]{4}-[a-f0-9]{12})" \
6357 " fs_id=(\\d+)$";
6358 const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6359 const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
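// Illustrative only: per MIRROR_INFO_REGEX above, the ceph.mirror.info value
// is expected to look like
//   "cluster_id=3b9b1f2c-0000-4f0a-8000-000000000000 fs_id=1"
// (a hypothetical cluster UUID and fs id); it is split into the
// ceph.mirror.info.cluster_id and ceph.mirror.info.fs_id xattrs by the
// setxattr handler below.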
6360 int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6361 std::string &cluster_id, std::string &fs_id) {
6362 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6363
6364 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6365 std::smatch match;
6366
6367 std::regex_search(value, match, regex);
6368 if (match.size() != 3) {
6369 derr << "mirror info parse error" << dendl;
6370 return -CEPHFS_EINVAL;
6371 }
6372
6373 cluster_id = match[1];
6374 fs_id = match[2];
6375 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6376 return 0;
6377 }
6378
6379 int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6380 XattrOp *xattr_op) {
6381 if (!cur->is_root()) {
6382 return -CEPHFS_EINVAL;
6383 }
6384
6385 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6386 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6387 if (v1 != v2) {
6388 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6389 return -CEPHFS_EINVAL;
6390 }
6391
6392 if (v1 < 0) {
6393 return v1;
6394 }
6395
6396 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6397 return 0;
6398 }
6399
6400 std::string cluster_id;
6401 std::string fs_id;
6402 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6403 cluster_id, fs_id);
6404 if (r < 0) {
6405 return r;
6406 }
6407
6408 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6409 return 0;
6410 }
6411
6412 void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6413 const XattrOp &xattr_op) {
6414 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6415
6416 bufferlist bl;
6417 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6418 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6419
6420 bl.clear();
6421 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6422 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6423 }
6424
6425 void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6426 const XattrOp &xattr_op) {
6427 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6428 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6429 }
6430
6431 void Server::handle_client_setxattr(MDRequestRef& mdr)
6432 {
6433 const cref_t<MClientRequest> &req = mdr->client_request;
6434 string name(req->get_path2());
6435
6436 // is a ceph virtual xattr?
6437 if (is_ceph_vxattr(name)) {
6438 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6439 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6440 if (!cur)
6441 return;
6442
6443 handle_set_vxattr(mdr, cur);
6444 return;
6445 }
6446
6447 if (!is_allowed_ceph_xattr(name)) {
6448 respond_to_request(mdr, -CEPHFS_EINVAL);
6449 return;
6450 }
6451
6452 CInode *cur = rdlock_path_pin_ref(mdr, true);
6453 if (!cur)
6454 return;
6455
6456 if (mdr->snapid != CEPH_NOSNAP) {
6457 respond_to_request(mdr, -CEPHFS_EROFS);
6458 return;
6459 }
6460
6461 int flags = req->head.args.setxattr.flags;
6462
6463 MutationImpl::LockOpVec lov;
6464 lov.add_xlock(&cur->xattrlock);
6465 if (!mds->locker->acquire_locks(mdr, lov))
6466 return;
6467
6468 if (!check_access(mdr, cur, MAY_WRITE))
6469 return;
6470
6471 size_t len = req->get_data().length();
6472 size_t inc = len + name.length();
6473
6474 auto handler = Server::get_xattr_or_default_handler(name);
6475 const auto& pxattrs = cur->get_projected_xattrs();
6476 if (pxattrs) {
6477 // check xattrs kv pairs size
6478 size_t cur_xattrs_size = 0;
6479 for (const auto& p : *pxattrs) {
6480 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6481 continue;
6482 }
6483 cur_xattrs_size += p.first.length() + p.second.length();
6484 }
6485
6486 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6487 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6488 << cur_xattrs_size << ", inc " << inc << dendl;
6489 respond_to_request(mdr, -CEPHFS_ENOSPC);
6490 return;
6491 }
6492 }
6493
6494 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6495 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6496 if (r < 0) {
6497 respond_to_request(mdr, r);
6498 return;
6499 }
6500
6501 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6502
6503 // project update
6504 auto pi = cur->project_inode(mdr, true);
6505 pi.inode->version = cur->pre_dirty();
6506 pi.inode->ctime = mdr->get_op_stamp();
6507 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6508 pi.inode->rstat.rctime = mdr->get_op_stamp();
6509 pi.inode->change_attr++;
6510 pi.inode->xattr_version++;
6511
6512 if ((flags & CEPH_XATTR_REMOVE)) {
6513 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6514 } else {
6515 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6516 }
6517
6518 // log + wait
6519 mdr->ls = mdlog->get_current_segment();
6520 EUpdate *le = new EUpdate(mdlog, "setxattr");
6521 mdlog->start_entry(le);
6522 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6523 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6524 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6525
6526 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6527 }
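// A small self-contained model of the size check above: the projected xattr
// map may not grow past mds_max_xattr_pairs_size, counting every existing
// key+value pair except the one a CEPH_XATTR_REPLACE is about to overwrite,
// plus the incoming pair.  The plain std::map and the 64 KiB limit below are
// simplifying assumptions for illustration; the real check walks the
// projected InodeStoreBase xattr map and reads the limit from the MDS config.
[[maybe_unused]] static bool xattr_fits_sketch(const std::map<std::string, std::string>& xattrs,
                                               const std::string& name,
                                               size_t value_len,
                                               bool replace,
                                               size_t limit = 64 * 1024)
{
  size_t cur = 0;
  for (const auto& [k, v] : xattrs) {
    if (replace && k == name)
      continue;                          // the replaced pair is not double counted
    cur += k.length() + v.length();
  }
  // the handler answers -CEPHFS_ENOSPC when this is false
  return cur + name.length() + value_len <= limit;
}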
6528
6529 void Server::handle_client_removexattr(MDRequestRef& mdr)
6530 {
6531 const cref_t<MClientRequest> &req = mdr->client_request;
6532 std::string name(req->get_path2());
6533
6534 // is a ceph virtual xattr?
6535 if (is_ceph_vxattr(name)) {
6536 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6537 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6538 if (!cur)
6539 return;
6540
6541 handle_remove_vxattr(mdr, cur);
6542 return;
6543 }
6544
6545 if (!is_allowed_ceph_xattr(name)) {
6546 respond_to_request(mdr, -CEPHFS_EINVAL);
6547 return;
6548 }
6549
6550 CInode* cur = rdlock_path_pin_ref(mdr, true);
6551 if (!cur)
6552 return;
6553
6554 if (mdr->snapid != CEPH_NOSNAP) {
6555 respond_to_request(mdr, -CEPHFS_EROFS);
6556 return;
6557 }
6558
6559 MutationImpl::LockOpVec lov;
6560 lov.add_xlock(&cur->xattrlock);
6561 if (!mds->locker->acquire_locks(mdr, lov))
6562 return;
6563
6564
6565 auto handler = Server::get_xattr_or_default_handler(name);
6566 bufferlist bl;
6567 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6568
6569 const auto& pxattrs = cur->get_projected_xattrs();
6570 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6571 if (r < 0) {
6572 respond_to_request(mdr, r);
6573 return;
6574 }
6575
6576 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6577
6578 // project update
6579 auto pi = cur->project_inode(mdr, true);
6580 pi.inode->version = cur->pre_dirty();
6581 pi.inode->ctime = mdr->get_op_stamp();
6582 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6583 pi.inode->rstat.rctime = mdr->get_op_stamp();
6584 pi.inode->change_attr++;
6585 pi.inode->xattr_version++;
6586 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6587
6588 // log + wait
6589 mdr->ls = mdlog->get_current_segment();
6590 EUpdate *le = new EUpdate(mdlog, "removexattr");
6591 mdlog->start_entry(le);
6592 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6593 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6594 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6595
6596 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6597 }
6598
6599 void Server::handle_client_getvxattr(MDRequestRef& mdr)
6600 {
6601 const auto& req = mdr->client_request;
6602 string xattr_name{req->get_path2()};
6603
6604 // is a ceph virtual xattr?
6605 if (!is_ceph_vxattr(xattr_name)) {
6606 respond_to_request(mdr, -CEPHFS_ENODATA);
6607 return;
6608 }
6609
6610 CInode *cur = rdlock_path_pin_ref(mdr, true, false);
6611 if (!cur) {
6612 return;
6613 }
6614
6615 if (is_ceph_dir_vxattr(xattr_name)) {
6616 if (!cur->is_dir()) {
6617 respond_to_request(mdr, -CEPHFS_ENODATA);
6618 return;
6619 }
6620 } else if (is_ceph_file_vxattr(xattr_name)) {
6621 if (cur->is_dir()) {
6622 respond_to_request(mdr, -CEPHFS_ENODATA);
6623 return;
6624 }
6625 }
6626
6627 CachedStackStringStream css;
6628 int r = 0;
6629 ceph::bufferlist bl;
6630 // handle these vxattrs
6631 if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
6632 (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
6633 std::string layout_field;
6634
6635 struct layout_xattr_info_t {
6636 enum class InheritanceStatus : uint32_t {
6637 DEFAULT = 0,
6638 SET = 1,
6639 INHERITED = 2
6640 };
6641
6642 const file_layout_t layout;
6643 const InheritanceStatus status;
6644
6645 layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
6646 : layout(l), status(inh) { }
6647
6648 static std::string status_to_string(InheritanceStatus status) {
6649 switch (status) {
6650 case InheritanceStatus::DEFAULT: return "default"s;
6651 case InheritanceStatus::SET: return "set"s;
6652 case InheritanceStatus::INHERITED: return "inherited"s;
6653 default: return "unknown"s;
6654 }
6655 }
6656 };
6657
6658 auto is_default_layout = [&](const file_layout_t& layout) -> bool {
6659 return (layout == mdcache->default_file_layout);
6660 };
6661 auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
6662 auto orig_in = cur;
6663
6664 while (cur) {
6665 if (cur->get_projected_inode()->has_layout()) {
6666 auto& curr_layout = cur->get_projected_inode()->layout;
6667 if (is_default_layout(curr_layout)) {
6668 return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
6669 }
6670 if (cur == orig_in) {
6671 // we've found a new layout at this inode
6672 return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
6673 } else {
6674 return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
6675 }
6676 }
6677
6678 if (cur->is_root()) {
6679 break;
6680 }
6681
6682 cur = cur->get_projected_parent_dir()->get_inode();
6683 }
6684 mds->clog->error() << "no layout found at root dir!";
6685 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6686 };
6687
6688 if (xattr_name == "ceph.dir.layout.json"sv ||
6689 xattr_name == "ceph.file.layout.json"sv) {
6690 // fetch layout only for valid xattr_name
6691 const auto lxi = get_inherited_layout(cur);
6692
6693 *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
6694 << ", \"stripe_count\": " << lxi.layout.stripe_count
6695 << ", \"object_size\": " << lxi.layout.object_size
6696 << ", \"pool_name\": ";
6697 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6698 *css << "\"";
6699 if (o.have_pg_pool(lxi.layout.pool_id)) {
6700 *css << o.get_pool_name(lxi.layout.pool_id);
6701 }
6702 *css << "\"";
6703 });
6704 *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
6705 *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
6706 *css << ", \"inheritance\": \"@"
6707 << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
6708 } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
6709 (xattr_name == "ceph.file.layout.pool_name"sv)) {
6710 // fetch layout only for valid xattr_name
6711 const auto lxi = get_inherited_layout(cur);
6712 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6713 if (o.have_pg_pool(lxi.layout.pool_id)) {
6714 *css << o.get_pool_name(lxi.layout.pool_id);
6715 }
6716 });
6717 } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
6718 (xattr_name == "ceph.file.layout.pool_id"sv)) {
6719 // fetch layout only for valid xattr_name
6720 const auto lxi = get_inherited_layout(cur);
6721 *css << (uint64_t)lxi.layout.pool_id;
6722 } else {
6723 r = -CEPHFS_ENODATA; // no such attribute
6724 }
6725 } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
6726 if (xattr_name == "ceph.dir.pin"sv) {
6727 *css << cur->get_projected_inode()->export_pin;
6728 } else if (xattr_name == "ceph.dir.pin.random"sv) {
6729 *css << cur->get_projected_inode()->export_ephemeral_random_pin;
6730 } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
6731 *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
6732 } else {
6733 // otherwise respond as invalid request
6734 // since we only handle ceph vxattrs here
6735 r = -CEPHFS_ENODATA; // no such attribute
6736 }
6737 } else {
6738 // otherwise respond as invalid request
6739 // since we only handle ceph vxattrs here
6740 r = -CEPHFS_ENODATA; // no such attribute
6741 }
6742
6743 if (r == 0) {
6744 ENCODE_START(1, 1, bl);
6745 encode(css->strv(), bl);
6746 ENCODE_FINISH(bl);
6747 mdr->reply_extra_bl = bl;
6748 }
6749
6750 respond_to_request(mdr, r);
6751 }
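// For illustration, the ceph.dir.layout.json / ceph.file.layout.json branch
// above emits a single-line JSON document.  With the stock 4 MiB layout on a
// pool named "cephfs_data" (example values, not taken from a real cluster)
// the reply body would look like:
//
//   {"stripe_unit": 4194304, "stripe_count": 1, "object_size": 4194304,
//    "pool_name": "cephfs_data", "pool_id": 2, "pool_namespace": "",
//    "inheritance": "@default"}
//
// The string is then wrapped in the small versioned encoding
// (ENCODE_START / encode / ENCODE_FINISH) and handed back via
// mdr->reply_extra_bl.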
6752
6753 // =================================================================
6754 // DIRECTORY and NAMESPACE OPS
6755
6756
6757 // ------------------------------------------------
6758
6759 struct C_WaitUnlinkToFinish : public MDSContext {
6760 protected:
6761 MDCache *mdcache;
6762 CDentry *dn;
6763 MDSContext *fin;
6764
6765 MDSRank *get_mds() override
6766 {
6767 ceph_assert(mdcache != NULL);
6768 return mdcache->mds;
6769 }
6770
6771 public:
6772 C_WaitUnlinkToFinish(MDCache *m, CDentry *d, MDSContext *f) :
6773 mdcache(m), dn(d), fin(f) {}
6774 void finish(int r) override {
6775 fin->complete(r);
6776 dn->put(CDentry::PIN_PURGING);
6777 }
6778 };
6779
6780 bool Server::is_unlink_pending(CDentry *dn)
6781 {
6782 CDentry::linkage_t *dnl = dn->get_projected_linkage();
6783 if (!dnl->is_null() && dn->state_test(CDentry::STATE_UNLINKING)) {
6784 return true;
6785 }
6786 return false;
6787 }
6788
6789 void Server::wait_for_pending_unlink(CDentry *dn, MDRequestRef& mdr)
6790 {
6791 dout(20) << __func__ << " dn " << *dn << dendl;
6792 mds->locker->drop_locks(mdr.get());
6793 auto fin = new C_MDS_RetryRequest(mdcache, mdr);
6794 dn->get(CDentry::PIN_PURGING);
6795 dn->add_waiter(CDentry::WAIT_UNLINK_FINISH, new C_WaitUnlinkToFinish(mdcache, dn, fin));
6796 }
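// Taken together, the helpers above implement a simple retry protocol for
// racing namespace operations: when a dentry is still being unlinked
// (STATE_UNLINKING), the new request drops any locks it already acquired,
// pins the dentry so it cannot be trimmed, and parks a C_MDS_RetryRequest on
// WAIT_UNLINK_FINISH.  Once the unlink commits, the waiter re-dispatches the
// request from scratch and C_WaitUnlinkToFinish drops the pin.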
6797
6798 // MKNOD
6799
6800 class C_MDS_mknod_finish : public ServerLogContext {
6801 CDentry *dn;
6802 CInode *newi;
6803 public:
6804 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6805 ServerLogContext(s, r), dn(d), newi(ni) {}
6806 void finish(int r) override {
6807 ceph_assert(r == 0);
6808
6809 // link the inode
6810 dn->pop_projected_linkage();
6811
6812 // be a bit hacky with the inode version, here.. we decrement it
 6813     // just to keep mark_dirty() happy. (we didn't bother projecting
 6814     // a new version of the inode since it's just been created)
6815 newi->mark_dirty(mdr->ls);
6816 newi->mark_dirty_parent(mdr->ls, true);
6817
6818 // mkdir?
6819 if (newi->is_dir()) {
6820 CDir *dir = newi->get_dirfrag(frag_t());
6821 ceph_assert(dir);
6822 dir->mark_dirty(mdr->ls);
6823 dir->mark_new(mdr->ls);
6824 }
6825
6826 mdr->apply();
6827
6828 MDRequestRef null_ref;
6829 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6830
6831 if (newi->is_file()) {
6832 get_mds()->locker->share_inode_max_size(newi);
6833 } else if (newi->is_dir()) {
6834 // We do this now so that the linkages on the new directory are stable.
6835 newi->maybe_ephemeral_rand();
6836 }
6837
6838 // hit pop
6839 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6840
6841 // reply
6842 server->respond_to_request(mdr, 0);
6843 }
6844 };
6845
6846
6847 void Server::handle_client_mknod(MDRequestRef& mdr)
6848 {
6849 const cref_t<MClientRequest> &req = mdr->client_request;
6850 client_t client = mdr->get_client();
6851
6852 unsigned mode = req->head.args.mknod.mode;
6853 if ((mode & S_IFMT) == 0)
6854 mode |= S_IFREG;
6855
6856 mdr->disable_lock_cache();
6857 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, false, S_ISREG(mode));
6858 if (!dn)
6859 return;
6860
6861 if (is_unlink_pending(dn)) {
6862 wait_for_pending_unlink(dn, mdr);
6863 return;
6864 }
6865
6866 CDir *dir = dn->get_dir();
6867 CInode *diri = dir->get_inode();
6868 if (!check_access(mdr, diri, MAY_WRITE))
6869 return;
6870 if (!check_fragment_space(mdr, dir))
6871 return;
6872 if (!check_dir_max_entries(mdr, dir))
6873 return;
6874
6875 ceph_assert(dn->get_projected_linkage()->is_null());
6876 if (req->get_alternate_name().size() > alternate_name_max) {
6877 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6878 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6879 return;
6880 }
6881 dn->set_alternate_name(req->get_alternate_name());
6882
6883 // set layout
6884 file_layout_t layout;
6885 if (mdr->dir_layout != file_layout_t())
6886 layout = mdr->dir_layout;
6887 else
6888 layout = mdcache->default_file_layout;
6889
6890 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6891 ceph_assert(newi);
6892
6893 dn->push_projected_linkage(newi);
6894
6895 auto _inode = newi->_get_inode();
6896 _inode->version = dn->pre_dirty();
6897 _inode->rdev = req->head.args.mknod.rdev;
6898 _inode->rstat.rfiles = 1;
6899 _inode->accounted_rstat = _inode->rstat;
6900 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6901 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6902 _inode->update_backtrace();
6903
6904 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6905 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6906 ceph_assert(follows >= realm->get_newest_seq());
6907
6908 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6909 // want to write to it (e.g., if they are reexporting NFS)
6910 if (S_ISREG(_inode->mode)) {
6911 // issue a cap on the file
6912 int cmode = CEPH_FILE_MODE_RDWR;
6913 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6914 if (cap) {
6915 cap->set_wanted(0);
6916
6917 // put locks in excl mode
6918 newi->filelock.set_state(LOCK_EXCL);
6919 newi->authlock.set_state(LOCK_EXCL);
6920 newi->xattrlock.set_state(LOCK_EXCL);
6921
6922 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6923 _inode->client_ranges[client].range.first = 0;
6924 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6925 _inode->client_ranges[client].follows = follows;
6926 newi->mark_clientwriteable();
6927 cap->mark_clientwriteable();
6928 }
6929 }
6930
6931 ceph_assert(dn->first == follows + 1);
6932 newi->first = dn->first;
6933
6934 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6935
6936 // prepare finisher
6937 mdr->ls = mdlog->get_current_segment();
6938 EUpdate *le = new EUpdate(mdlog, "mknod");
6939 mdlog->start_entry(le);
6940 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6941 journal_allocated_inos(mdr, &le->metablob);
6942
6943 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6944 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6945 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6946
6947 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6948 mds->balancer->maybe_fragment(dn->get_dir(), false);
6949 }
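// Worked example for the regular-file branch above (example values): with the
// default 4 MiB stripe unit the new inode gets
// client_ranges[client].range = [0, 4194304), so the client that issued the
// mknod can write the first stripe unit under its exclusive caps without
// another round trip; only growth past that range requires the MDS to extend
// max_size.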
6950
6951
6952
6953 // MKDIR
6954 /* This function takes responsibility for the passed mdr*/
6955 void Server::handle_client_mkdir(MDRequestRef& mdr)
6956 {
6957 const cref_t<MClientRequest> &req = mdr->client_request;
6958
6959 mdr->disable_lock_cache();
6960 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6961 if (!dn)
6962 return;
6963
6964 if (is_unlink_pending(dn)) {
6965 wait_for_pending_unlink(dn, mdr);
6966 return;
6967 }
6968
6969 CDir *dir = dn->get_dir();
6970 CInode *diri = dir->get_inode();
6971
6972 // mkdir check access
6973 if (!check_access(mdr, diri, MAY_WRITE))
6974 return;
6975
6976 if (!check_fragment_space(mdr, dir))
6977 return;
6978 if (!check_dir_max_entries(mdr, dir))
6979 return;
6980
6981 ceph_assert(dn->get_projected_linkage()->is_null());
6982 if (req->get_alternate_name().size() > alternate_name_max) {
6983 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6984 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6985 return;
6986 }
6987 dn->set_alternate_name(req->get_alternate_name());
6988
6989 // new inode
6990 unsigned mode = req->head.args.mkdir.mode;
6991 mode &= ~S_IFMT;
6992 mode |= S_IFDIR;
6993 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6994 ceph_assert(newi);
6995
6996 // it's a directory.
6997 dn->push_projected_linkage(newi);
6998
6999 auto _inode = newi->_get_inode();
7000 _inode->version = dn->pre_dirty();
7001 _inode->rstat.rsubdirs = 1;
7002 _inode->accounted_rstat = _inode->rstat;
7003 _inode->update_backtrace();
7004
7005 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7006 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
7007 ceph_assert(follows >= realm->get_newest_seq());
7008
7009 dout(12) << " follows " << follows << dendl;
7010 ceph_assert(dn->first == follows + 1);
7011 newi->first = dn->first;
7012
7013 // ...and that new dir is empty.
7014 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
7015 newdir->state_set(CDir::STATE_CREATING);
7016 newdir->mark_complete();
7017 newdir->_get_fnode()->version = newdir->pre_dirty();
7018
7019 // prepare finisher
7020 mdr->ls = mdlog->get_current_segment();
7021 EUpdate *le = new EUpdate(mdlog, "mkdir");
7022 mdlog->start_entry(le);
7023 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
7024 journal_allocated_inos(mdr, &le->metablob);
7025 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7026 le->metablob.add_primary_dentry(dn, newi, true, true);
7027 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
7028
7029 // issue a cap on the directory
7030 int cmode = CEPH_FILE_MODE_RDWR;
7031 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7032 if (cap) {
7033 cap->set_wanted(0);
7034
7035 // put locks in excl mode
7036 newi->filelock.set_state(LOCK_EXCL);
7037 newi->authlock.set_state(LOCK_EXCL);
7038 newi->xattrlock.set_state(LOCK_EXCL);
7039 }
7040
7041 // make sure this inode gets into the journal
7042 le->metablob.add_opened_ino(newi->ino());
7043
7044 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
7045
7046 // We hit_dir (via hit_inode) in our finish callback, but by then we might
7047 // have overshot the split size (multiple mkdir in flight), so here is
7048 // an early chance to split the dir if this mkdir makes it oversized.
7049 mds->balancer->maybe_fragment(dir, false);
7050 }
7051
7052
7053 // SYMLINK
7054
7055 void Server::handle_client_symlink(MDRequestRef& mdr)
7056 {
7057 const auto& req = mdr->client_request;
7058
7059 mdr->disable_lock_cache();
7060 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
7061 if (!dn)
7062 return;
7063
7064 if (is_unlink_pending(dn)) {
7065 wait_for_pending_unlink(dn, mdr);
7066 return;
7067 }
7068
7069 CDir *dir = dn->get_dir();
7070 CInode *diri = dir->get_inode();
7071
7072 if (!check_access(mdr, diri, MAY_WRITE))
7073 return;
7074 if (!check_fragment_space(mdr, dir))
7075 return;
7076 if (!check_dir_max_entries(mdr, dir))
7077 return;
7078
7079 ceph_assert(dn->get_projected_linkage()->is_null());
7080 if (req->get_alternate_name().size() > alternate_name_max) {
7081 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
 7082     respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
          return;  // bail out like the mknod/mkdir/link paths; don't fall through after replying
 7083   }
7084 dn->set_alternate_name(req->get_alternate_name());
7085
7086 unsigned mode = S_IFLNK | 0777;
7087 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
7088 ceph_assert(newi);
7089
7090 // it's a symlink
7091 dn->push_projected_linkage(newi);
7092
7093 newi->symlink = req->get_path2();
7094 auto _inode = newi->_get_inode();
7095 _inode->version = dn->pre_dirty();
7096 _inode->size = newi->symlink.length();
7097 _inode->rstat.rbytes = _inode->size;
7098 _inode->rstat.rfiles = 1;
7099 _inode->accounted_rstat = _inode->rstat;
7100 _inode->update_backtrace();
7101
7102 newi->first = dn->first;
7103
7104 // prepare finisher
7105 mdr->ls = mdlog->get_current_segment();
7106 EUpdate *le = new EUpdate(mdlog, "symlink");
7107 mdlog->start_entry(le);
7108 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
7109 journal_allocated_inos(mdr, &le->metablob);
7110 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7111 le->metablob.add_primary_dentry(dn, newi, true, true);
7112
7113 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
7114 mds->balancer->maybe_fragment(dir, false);
7115 }
7116
7117
7118
7119
7120
7121 // LINK
7122
7123 void Server::handle_client_link(MDRequestRef& mdr)
7124 {
7125 const cref_t<MClientRequest> &req = mdr->client_request;
7126
7127 dout(7) << "handle_client_link " << req->get_filepath()
7128 << " to " << req->get_filepath2()
7129 << dendl;
7130
7131 mdr->disable_lock_cache();
7132
7133 CDentry *destdn;
7134 CInode *targeti;
7135
7136 if (req->get_filepath2().depth() == 0) {
7137 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
7138 if (!targeti) {
7139 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
7140 inodeno_t ino = req->get_filepath2().get_ino();
7141 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
7142 return;
7143 }
7144 mdr->pin(targeti);
7145
7146 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
7147 CDentry *pdn = targeti->get_projected_parent_dn();
7148 if (!pdn) {
7149 dout(7) << "target has no parent dn, failing..." << dendl;
7150 respond_to_request(mdr, -CEPHFS_EINVAL);
7151 return;
7152 }
7153 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
7154 return;
7155 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
7156 }
7157
7158 destdn = rdlock_path_xlock_dentry(mdr, false);
7159 if (!destdn)
7160 return;
7161 } else {
7162 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
7163 destdn = ret.first;
7164 if (!destdn)
7165 return;
7166
7167 if (!destdn->get_projected_linkage()->is_null()) {
7168 respond_to_request(mdr, -CEPHFS_EEXIST);
7169 return;
7170 }
7171
7172 targeti = ret.second->get_projected_linkage()->get_inode();
7173 }
7174
7175 if (is_unlink_pending(destdn)) {
7176 wait_for_pending_unlink(destdn, mdr);
7177 return;
7178 }
7179
7180 ceph_assert(destdn->get_projected_linkage()->is_null());
7181 if (req->get_alternate_name().size() > alternate_name_max) {
7182 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
7183 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
7184 return;
7185 }
7186 destdn->set_alternate_name(req->get_alternate_name());
7187
7188 if (targeti->is_dir()) {
7189 dout(7) << "target is a dir, failing..." << dendl;
7190 respond_to_request(mdr, -CEPHFS_EINVAL);
7191 return;
7192 }
7193
7194 CDir *dir = destdn->get_dir();
7195 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
7196 dout(7) << "target is " << *targeti << dendl;
7197
7198 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7199 MutationImpl::LockOpVec lov;
7200 lov.add_xlock(&targeti->snaplock);
7201 lov.add_xlock(&targeti->linklock);
7202
7203 if (!mds->locker->acquire_locks(mdr, lov))
7204 return;
7205
7206 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7207 }
7208
7209 if (targeti->get_projected_inode()->nlink == 0) {
7210 dout(7) << "target has no link, failing..." << dendl;
7211 respond_to_request(mdr, -CEPHFS_ENOENT);
7212 return;
7213 }
7214
7215 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7216 if (!check_access(mdr, targeti, MAY_WRITE))
7217 return;
7218
7219 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
7220 return;
7221
7222 if (!check_fragment_space(mdr, dir))
7223 return;
7224
7225 if (!check_dir_max_entries(mdr, dir))
7226 return;
7227 }
7228
7229 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
7230 SnapRealm *target_realm = target_pin->find_snaprealm();
7231 if (target_pin != dir->inode &&
7232 target_realm->get_subvolume_ino() !=
7233 dir->inode->find_snaprealm()->get_subvolume_ino()) {
7234 if (target_pin->is_stray()) {
7235 mds->locker->drop_locks(mdr.get());
7236 targeti->add_waiter(CInode::WAIT_UNLINK,
7237 new C_MDS_RetryRequest(mdcache, mdr));
7238 mdlog->flush();
7239 return;
7240 }
7241 dout(7) << "target is in different subvolume, failing..." << dendl;
7242 respond_to_request(mdr, -CEPHFS_EXDEV);
7243 return;
7244 }
7245
7246 // go!
7247 ceph_assert(g_conf()->mds_kill_link_at != 1);
7248
7249 // local or remote?
7250 if (targeti->is_auth())
7251 _link_local(mdr, destdn, targeti, target_realm);
7252 else
7253 _link_remote(mdr, true, destdn, targeti);
7254 mds->balancer->maybe_fragment(dir, false);
7255 }
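// The tail of handle_client_link() above picks one of two paths: if this MDS
// is authoritative for the link target, _link_local() bumps nlink and
// journals everything in a single EUpdate; otherwise _link_remote() runs a
// two-phase update in which the target's auth MDS acts as a witness (see the
// peer link prep/commit handlers further down).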
7256
7257
7258 class C_MDS_link_local_finish : public ServerLogContext {
7259 CDentry *dn;
7260 CInode *targeti;
7261 version_t dnpv;
7262 version_t tipv;
7263 bool adjust_realm;
7264 public:
7265 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
7266 version_t dnpv_, version_t tipv_, bool ar) :
7267 ServerLogContext(s, r), dn(d), targeti(ti),
7268 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7269 void finish(int r) override {
7270 ceph_assert(r == 0);
7271 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7272 }
7273 };
7274
7275
7276 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7277 {
7278 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
7279
7280 mdr->ls = mdlog->get_current_segment();
7281
7282 // predirty NEW dentry
7283 version_t dnpv = dn->pre_dirty();
7284 version_t tipv = targeti->pre_dirty();
7285
7286 // project inode update
7287 auto pi = targeti->project_inode(mdr);
7288 pi.inode->nlink++;
7289 pi.inode->ctime = mdr->get_op_stamp();
7290 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7291 pi.inode->rstat.rctime = mdr->get_op_stamp();
7292 pi.inode->change_attr++;
7293 pi.inode->version = tipv;
7294
7295 bool adjust_realm = false;
7296 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7297 sr_t *newsnap = targeti->project_snaprealm();
7298 targeti->mark_snaprealm_global(newsnap);
7299 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
7300 adjust_realm = true;
7301 }
7302
7303 // log + wait
7304 EUpdate *le = new EUpdate(mdlog, "link_local");
7305 mdlog->start_entry(le);
7306 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7307 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
7308 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
7309 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7310 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
7311
7312 // do this after predirty_*, to avoid funky extra dnl arg
7313 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7314
7315 journal_and_reply(mdr, targeti, dn, le,
7316 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7317 }
7318
7319 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
7320 version_t dnpv, version_t tipv, bool adjust_realm)
7321 {
7322 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
7323
7324 // link and unlock the NEW dentry
7325 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7326 if (!dnl->get_inode())
7327 dn->link_remote(dnl, targeti);
7328 dn->mark_dirty(dnpv, mdr->ls);
7329
7330 // target inode
7331 mdr->apply();
7332
7333 MDRequestRef null_ref;
7334 mdcache->send_dentry_link(dn, null_ref);
7335
7336 if (adjust_realm) {
7337 int op = CEPH_SNAP_OP_SPLIT;
7338 mds->mdcache->send_snap_update(targeti, 0, op);
7339 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7340 }
7341
7342 // bump target popularity
7343 mds->balancer->hit_inode(targeti, META_POP_IWR);
7344 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7345
7346 // reply
7347 respond_to_request(mdr, 0);
7348 }
7349
7350
7351 // link / unlink remote
7352
7353 class C_MDS_link_remote_finish : public ServerLogContext {
7354 bool inc;
7355 CDentry *dn;
7356 CInode *targeti;
7357 version_t dpv;
7358 public:
7359 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
7360 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
7361 dpv(d->get_projected_version()) {}
7362 void finish(int r) override {
7363 ceph_assert(r == 0);
7364 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
7365 }
7366 };
7367
7368 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
7369 {
7370 dout(10) << "_link_remote "
7371 << (inc ? "link ":"unlink ")
7372 << *dn << " to " << *targeti << dendl;
7373
7374 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7375 mds_rank_t linkauth = targeti->authority().first;
7376 if (mdr->more()->witnessed.count(linkauth) == 0) {
7377 if (mds->is_cluster_degraded() &&
7378 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
7379 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
7380 if (mdr->more()->waiting_on_peer.empty())
7381 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
7382 return;
7383 }
7384
7385 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
7386 int op;
7387 if (inc)
7388 op = MMDSPeerRequest::OP_LINKPREP;
7389 else
7390 op = MMDSPeerRequest::OP_UNLINKPREP;
7391 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7392 targeti->set_object_info(req->get_object_info());
7393 req->op_stamp = mdr->get_op_stamp();
7394 if (auto& desti_srnode = mdr->more()->desti_srnode)
7395 encode(*desti_srnode, req->desti_snapbl);
7396 mds->send_message_mds(req, linkauth);
7397
7398 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
7399 mdr->more()->waiting_on_peer.insert(linkauth);
7400 return;
7401 }
7402 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
7403
7404 ceph_assert(g_conf()->mds_kill_link_at != 2);
7405
7406 if (auto& desti_srnode = mdr->more()->desti_srnode) {
7407 delete desti_srnode;
7408 desti_srnode = NULL;
7409 }
7410
7411 mdr->set_mds_stamp(ceph_clock_now());
7412
7413 // add to event
7414 mdr->ls = mdlog->get_current_segment();
7415 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
7416 mdlog->start_entry(le);
7417 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7418 if (!mdr->more()->witnessed.empty()) {
7419 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7420 le->reqid = mdr->reqid;
7421 le->had_peers = true;
7422 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7423 }
7424
7425 if (inc) {
7426 dn->pre_dirty();
7427 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
7428 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7429 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7430 } else {
7431 dn->pre_dirty();
7432 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
7433 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7434 le->metablob.add_null_dentry(dn, true);
7435 dn->push_projected_linkage();
7436 }
7437
7438 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
7439 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7440 }
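// Rough shape of the remote link/unlink handshake driven by _link_remote(),
// summarizing the handlers below rather than introducing new protocol:
//
//   1. leader -> peer: MMDSPeerRequest OP_LINKPREP / OP_UNLINKPREP
//   2. peer (handle_peer_link_prep): project nlink++/--, journal an
//      EPeerUpdate PREPARE together with a rollback blob, then ack with
//      OP_LINKPREPACK from _logged_peer_link().
//   3. leader (handle_peer_link_prep_ack): record the witness and
//      re-dispatch; _link_remote() now journals its own EUpdate and replies
//      to the client.
//   4. when the leader later tells the peer to commit, _commit_peer_link()
//      journals an EPeerUpdate COMMIT and _committed_peer() acks with
//      OP_COMMITTED; on abort the peer instead replays the rollback blob in
//      do_link_rollback().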
7441
7442 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
7443 CDentry *dn, CInode *targeti,
7444 version_t dpv)
7445 {
7446 dout(10) << "_link_remote_finish "
7447 << (inc ? "link ":"unlink ")
7448 << *dn << " to " << *targeti << dendl;
7449
7450 ceph_assert(g_conf()->mds_kill_link_at != 3);
7451
7452 if (!mdr->more()->witnessed.empty())
7453 mdcache->logged_leader_update(mdr->reqid);
7454
7455 if (inc) {
7456 // link the new dentry
7457 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7458 if (!dnl->get_inode())
7459 dn->link_remote(dnl, targeti);
7460 dn->mark_dirty(dpv, mdr->ls);
7461 } else {
7462 // unlink main dentry
7463 dn->get_dir()->unlink_inode(dn);
7464 dn->pop_projected_linkage();
7465 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
7466 }
7467
7468 mdr->apply();
7469
7470 MDRequestRef null_ref;
7471 if (inc) {
7472 mdcache->send_dentry_link(dn, null_ref);
7473 } else {
7474 dn->state_clear(CDentry::STATE_UNLINKING);
7475 mdcache->send_dentry_unlink(dn, NULL, null_ref);
7476
7477 MDSContext::vec finished;
7478 dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished);
7479 mdcache->mds->queue_waiters(finished);
7480 }
7481
7482 // bump target popularity
7483 mds->balancer->hit_inode(targeti, META_POP_IWR);
7484 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7485
7486 // reply
7487 respond_to_request(mdr, 0);
7488
7489 if (!inc)
7490 // removing a new dn?
7491 dn->get_dir()->try_remove_unlinked_dn(dn);
7492 }
7493
7494
7495 // remote linking/unlinking
7496
7497 class C_MDS_PeerLinkPrep : public ServerLogContext {
7498 CInode *targeti;
7499 bool adjust_realm;
7500 public:
7501 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
7502 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7503 void finish(int r) override {
7504 ceph_assert(r == 0);
7505 server->_logged_peer_link(mdr, targeti, adjust_realm);
7506 }
7507 };
7508
7509 class C_MDS_PeerLinkCommit : public ServerContext {
7510 MDRequestRef mdr;
7511 CInode *targeti;
7512 public:
7513 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7514 ServerContext(s), mdr(r), targeti(t) { }
7515 void finish(int r) override {
7516 server->_commit_peer_link(mdr, r, targeti);
7517 }
7518 };
7519
7520 void Server::handle_peer_link_prep(MDRequestRef& mdr)
7521 {
7522 dout(10) << "handle_peer_link_prep " << *mdr
7523 << " on " << mdr->peer_request->get_object_info()
7524 << dendl;
7525
7526 ceph_assert(g_conf()->mds_kill_link_at != 4);
7527
7528 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
7529 ceph_assert(targeti);
7530 dout(10) << "targeti " << *targeti << dendl;
7531 CDentry *dn = targeti->get_parent_dn();
7532 CDentry::linkage_t *dnl = dn->get_linkage();
7533 ceph_assert(dnl->is_primary());
7534
7535 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7536
7537 mdr->auth_pin(targeti);
7538
7539 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7540 ceph_assert(g_conf()->mds_kill_link_at != 5);
7541
7542 // journal it
7543 mdr->ls = mdlog->get_current_segment();
7544 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7545 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7546 mdlog->start_entry(le);
7547
7548 auto pi = dnl->get_inode()->project_inode(mdr);
7549
7550 // update journaled target inode
7551 bool inc;
7552 bool adjust_realm = false;
7553 bool realm_projected = false;
7554 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7555 inc = true;
7556 pi.inode->nlink++;
7557
7558 CDentry *target_pdn = targeti->get_projected_parent_dn();
7559 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7560 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7561 sr_t *newsnap = targeti->project_snaprealm();
7562 targeti->mark_snaprealm_global(newsnap);
7563 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
7564 adjust_realm = true;
7565 realm_projected = true;
7566 }
7567 } else {
7568 inc = false;
7569 pi.inode->nlink--;
7570 if (targeti->is_projected_snaprealm_global()) {
7571 ceph_assert(mdr->peer_request->desti_snapbl.length());
7572 auto p = mdr->peer_request->desti_snapbl.cbegin();
7573
7574 sr_t *newsnap = targeti->project_snaprealm();
7575 decode(*newsnap, p);
7576
7577 if (pi.inode->nlink == 0)
7578 ceph_assert(!newsnap->is_parent_global());
7579
7580 realm_projected = true;
7581 } else {
7582 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
7583 }
7584 }
7585
7586 link_rollback rollback;
7587 rollback.reqid = mdr->reqid;
7588 rollback.ino = targeti->ino();
 7589   rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7590 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7591 rollback.old_dir_mtime = pf->fragstat.mtime;
7592 rollback.old_dir_rctime = pf->rstat.rctime;
7593 rollback.was_inc = inc;
7594 if (realm_projected) {
7595 if (targeti->snaprealm) {
7596 encode(true, rollback.snapbl);
7597 targeti->encode_snap_blob(rollback.snapbl);
7598 } else {
7599 encode(false, rollback.snapbl);
7600 }
7601 }
7602 encode(rollback, le->rollback);
7603 mdr->more()->rollback_bl = le->rollback;
7604
7605 pi.inode->ctime = mdr->get_op_stamp();
7606 pi.inode->version = targeti->pre_dirty();
7607
7608 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7609
7610 // commit case
7611 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7612 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
7613 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7614
7615 // set up commit waiter
7616 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7617
7618 mdr->more()->peer_update_journaled = true;
7619 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7620 mdr, __func__);
7621 mdlog->flush();
7622 }
7623
7624 void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7625 {
7626 dout(10) << "_logged_peer_link " << *mdr
7627 << " " << *targeti << dendl;
7628
7629 ceph_assert(g_conf()->mds_kill_link_at != 6);
7630
7631 // update the target
7632 mdr->apply();
7633
7634 // hit pop
7635 mds->balancer->hit_inode(targeti, META_POP_IWR);
7636
7637 // done.
7638 mdr->reset_peer_request();
7639
7640 if (adjust_realm) {
7641 int op = CEPH_SNAP_OP_SPLIT;
7642 mds->mdcache->send_snap_update(targeti, 0, op);
7643 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7644 }
7645
7646 // ack
7647 if (!mdr->aborted) {
7648 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7649 mds->send_message_mds(reply, mdr->peer_to_mds);
7650 } else {
7651 dout(10) << " abort flag set, finishing" << dendl;
7652 mdcache->request_finish(mdr);
7653 }
7654 }
7655
7656
7657 struct C_MDS_CommittedPeer : public ServerLogContext {
7658 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7659 void finish(int r) override {
7660 server->_committed_peer(mdr);
7661 }
7662 };
7663
7664 void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7665 {
7666 dout(10) << "_commit_peer_link " << *mdr
7667 << " r=" << r
7668 << " " << *targeti << dendl;
7669
7670 ceph_assert(g_conf()->mds_kill_link_at != 7);
7671
7672 if (r == 0) {
7673 // drop our pins, etc.
7674 mdr->cleanup();
7675
7676 // write a commit to the journal
7677 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7678 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7679 mdlog->start_entry(le);
7680 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7681 mdlog->flush();
7682 } else {
7683 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7684 }
7685 }
7686
7687 void Server::_committed_peer(MDRequestRef& mdr)
7688 {
7689 dout(10) << "_committed_peer " << *mdr << dendl;
7690
7691 ceph_assert(g_conf()->mds_kill_link_at != 8);
7692
7693 bool assert_exist = mdr->more()->peer_update_journaled;
7694 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7695 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7696 mds->send_message_mds(req, mdr->peer_to_mds);
7697 mdcache->request_finish(mdr);
7698 }
7699
7700 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7701 MutationRef mut;
7702 map<client_t,ref_t<MClientSnap>> splits;
7703 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
7704 map<client_t,ref_t<MClientSnap>>&& _splits) :
7705 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7706 }
7707 void finish(int r) override {
7708 server->_link_rollback_finish(mut, mdr, splits);
7709 }
7710 };
7711
7712 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7713 {
7714 link_rollback rollback;
7715 auto p = rbl.cbegin();
7716 decode(rollback, p);
7717
7718 dout(10) << "do_link_rollback on " << rollback.reqid
7719 << (rollback.was_inc ? " inc":" dec")
7720 << " ino " << rollback.ino
7721 << dendl;
7722
7723 ceph_assert(g_conf()->mds_kill_link_at != 9);
7724
7725 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7726 ceph_assert(mdr || mds->is_resolve());
7727
7728 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7729 mut->ls = mds->mdlog->get_current_segment();
7730
7731 CInode *in = mdcache->get_inode(rollback.ino);
7732 ceph_assert(in);
7733 dout(10) << " target is " << *in << dendl;
 7734   ceph_assert(!in->is_projected()); // a live peer request holds the versionlock xlock.
7735
7736 auto pi = in->project_inode(mut);
7737 pi.inode->version = in->pre_dirty();
7738
7739 // parent dir rctime
7740 CDir *parent = in->get_projected_parent_dn()->get_dir();
7741 auto pf = parent->project_fnode(mut);
7742 pf->version = parent->pre_dirty();
7743 if (pf->fragstat.mtime == pi.inode->ctime) {
7744 pf->fragstat.mtime = rollback.old_dir_mtime;
7745 if (pf->rstat.rctime == pi.inode->ctime)
7746 pf->rstat.rctime = rollback.old_dir_rctime;
7747 mut->add_updated_lock(&parent->get_inode()->filelock);
7748 mut->add_updated_lock(&parent->get_inode()->nestlock);
7749 }
7750
7751 // inode
7752 pi.inode->ctime = rollback.old_ctime;
7753 if (rollback.was_inc)
7754 pi.inode->nlink--;
7755 else
7756 pi.inode->nlink++;
7757
7758 map<client_t,ref_t<MClientSnap>> splits;
7759 if (rollback.snapbl.length() && in->snaprealm) {
7760 bool hadrealm;
7761 auto p = rollback.snapbl.cbegin();
7762 decode(hadrealm, p);
7763 if (hadrealm) {
7764 if (!mds->is_resolve()) {
7765 sr_t *new_srnode = new sr_t();
7766 decode(*new_srnode, p);
7767 in->project_snaprealm(new_srnode);
7768 } else {
7769 decode(in->snaprealm->srnode, p);
7770 }
7771 } else {
7772 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7773 if (!mds->is_resolve())
7774 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7775 in->project_snaprealm(NULL);
7776 }
7777 }
7778
7779 // journal it
7780 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7781 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7782 mdlog->start_entry(le);
7783 le->commit.add_dir_context(parent);
7784 le->commit.add_dir(parent, true);
7785 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7786
7787 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7788 mdr, __func__);
7789 mdlog->flush();
7790 }
7791
7792 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
7793 map<client_t,ref_t<MClientSnap>>& splits)
7794 {
7795 dout(10) << "_link_rollback_finish" << dendl;
7796
7797 ceph_assert(g_conf()->mds_kill_link_at != 10);
7798
7799 mut->apply();
7800
7801 if (!mds->is_resolve())
7802 mdcache->send_snaps(splits);
7803
7804 if (mdr)
7805 mdcache->request_finish(mdr);
7806
7807 mdcache->finish_rollback(mut->reqid, mdr);
7808
7809 mut->cleanup();
7810 }
7811
7812
7813 void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7814 {
7815 dout(10) << "handle_peer_link_prep_ack " << *mdr
7816 << " " << *m << dendl;
7817 mds_rank_t from = mds_rank_t(m->get_source().num());
7818
7819 ceph_assert(g_conf()->mds_kill_link_at != 11);
7820
7821 // note peer
7822 mdr->more()->peers.insert(from);
7823
7824 // witnessed!
7825 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7826 mdr->more()->witnessed.insert(from);
7827 ceph_assert(!m->is_not_journaled());
7828 mdr->more()->has_journaled_peers = true;
7829
7830 // remove from waiting list
7831 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7832 mdr->more()->waiting_on_peer.erase(from);
7833
7834 ceph_assert(mdr->more()->waiting_on_peer.empty());
7835
7836 dispatch_client_request(mdr); // go again!
7837 }
7838
7839
7840
7841
7842
7843 // UNLINK
7844
7845 void Server::handle_client_unlink(MDRequestRef& mdr)
7846 {
7847 const cref_t<MClientRequest> &req = mdr->client_request;
7848 client_t client = mdr->get_client();
7849
7850 // rmdir or unlink?
7851 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7852
7853 if (rmdir)
7854 mdr->disable_lock_cache();
7855
7856 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7857 if (!dn)
7858 return;
7859
7860 // notify replica MDSes the dentry is under unlink
7861 if (!dn->state_test(CDentry::STATE_UNLINKING)) {
7862 dn->state_set(CDentry::STATE_UNLINKING);
7863 mdcache->send_dentry_unlink(dn, nullptr, mdr, true);
7864 if (dn->replica_unlinking_ref) {
7865 return;
7866 }
7867 }
7868
7869 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
7870 ceph_assert(!dnl->is_null());
7871 CInode *in = dnl->get_inode();
7872
7873 if (rmdir) {
7874 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7875 } else {
7876 dout(7) << "handle_client_unlink on " << *dn << dendl;
7877 }
7878 dout(7) << "dn links to " << *in << dendl;
7879
7880 // rmdir vs is_dir
7881 if (in->is_dir()) {
7882 if (rmdir) {
7883 // do empty directory checks
7884 if (_dir_is_nonempty_unlocked(mdr, in)) {
7885 dn->state_clear(CDentry::STATE_UNLINKING);
7886 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7887 return;
7888 }
7889 } else {
7890 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
7891 dn->state_clear(CDentry::STATE_UNLINKING);
7892 respond_to_request(mdr, -CEPHFS_EISDIR);
7893 return;
7894 }
7895 } else {
7896 if (rmdir) {
7897 // unlink
7898 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
7899 dn->state_clear(CDentry::STATE_UNLINKING);
7900 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7901 return;
7902 }
7903 }
7904
7905 CInode *diri = dn->get_dir()->get_inode();
7906 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7907 if (!check_access(mdr, diri, MAY_WRITE)) {
7908 dn->state_clear(CDentry::STATE_UNLINKING);
7909 return;
7910 }
7911 }
7912
7913 // -- create stray dentry? --
7914 CDentry *straydn = NULL;
7915 if (dnl->is_primary()) {
7916 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7917 if (!straydn)
7918 return;
7919 dout(10) << " straydn is " << *straydn << dendl;
7920 } else if (mdr->straydn) {
7921 mdr->unpin(mdr->straydn);
7922 mdr->straydn = NULL;
7923 }
7924
7925 // lock
7926 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7927 MutationImpl::LockOpVec lov;
7928
7929 lov.add_xlock(&in->linklock);
7930 lov.add_xlock(&in->snaplock);
7931 if (in->is_dir())
7932 lov.add_rdlock(&in->filelock); // to verify it's empty
7933
7934 if (straydn) {
7935 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7936 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7937 lov.add_xlock(&straydn->lock);
7938 }
7939
7940 if (!mds->locker->acquire_locks(mdr, lov))
7941 return;
7942
7943 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7944 }
7945
7946 if (in->is_dir() &&
7947 _dir_is_nonempty(mdr, in)) {
7948 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7949 dn->state_clear(CDentry::STATE_UNLINKING);
7950 return;
7951 }
7952
7953 if (straydn)
7954 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7955
7956 if (!mdr->more()->desti_srnode) {
7957 if (in->is_projected_snaprealm_global()) {
7958 sr_t *new_srnode = in->prepare_new_srnode(0);
7959 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7960 // dropping the last linkage or dropping the last remote linkage,
 7961       // detach the inode from the global snaprealm
7962 auto nlink = in->get_projected_inode()->nlink;
7963 if (nlink == 1 ||
7964 (nlink == 2 && !dnl->is_primary() &&
7965 !in->get_projected_parent_dir()->inode->is_stray()))
7966 in->clear_snaprealm_global(new_srnode);
7967 mdr->more()->desti_srnode = new_srnode;
7968 } else if (dnl->is_primary()) {
7969 // prepare snaprealm blob for peer request
7970 SnapRealm *realm = in->find_snaprealm();
7971 snapid_t follows = realm->get_newest_seq();
7972 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7973 sr_t *new_srnode = in->prepare_new_srnode(follows);
7974 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7975 mdr->more()->desti_srnode = new_srnode;
7976 }
7977 }
7978 }
7979
7980 // yay!
7981 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7982 // subtree root auths need to be witnesses
7983 set<mds_rank_t> witnesses;
7984 in->list_replicas(witnesses);
7985 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7986
7987 for (set<mds_rank_t>::iterator p = witnesses.begin();
7988 p != witnesses.end();
7989 ++p) {
7990 if (mdr->more()->witnessed.count(*p)) {
7991 dout(10) << " already witnessed by mds." << *p << dendl;
7992 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7993 dout(10) << " already waiting on witness mds." << *p << dendl;
7994 } else {
7995 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7996 return;
7997 }
7998 }
7999 if (!mdr->more()->waiting_on_peer.empty())
8000 return; // we're waiting for a witness.
8001 }
8002
8003 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
8004 mds->locker->create_lock_cache(mdr, diri);
8005
8006 // ok!
8007 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
8008 _link_remote(mdr, false, dn, dnl->get_inode());
8009 else
8010 _unlink_local(mdr, dn, straydn);
8011 }
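// handle_client_unlink() above funnels both unlink and rmdir through the same
// machinery: the dentry is flagged STATE_UNLINKING and replicas are notified,
// a stray dentry is prepared when a primary linkage is being dropped,
// subtree-root directories additionally require every replica MDS to witness
// the rmdir, and finally either _unlink_local() (inode authoritative here) or
// _link_remote(false, ...) (remote linkage whose inode lives on another MDS)
// does the journaling.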
8012
8013 class C_MDS_unlink_local_finish : public ServerLogContext {
8014 CDentry *dn;
8015 CDentry *straydn;
8016 version_t dnpv; // deleted dentry
8017 public:
8018 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
8019 ServerLogContext(s, r), dn(d), straydn(sd),
8020 dnpv(d->get_projected_version()) {}
8021 void finish(int r) override {
8022 ceph_assert(r == 0);
8023 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
8024 }
8025 };
8026
8027 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8028 {
8029 dout(10) << "_unlink_local " << *dn << dendl;
8030
8031 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8032 CInode *in = dnl->get_inode();
8033
8034
8035 // ok, let's do it.
8036 mdr->ls = mdlog->get_current_segment();
8037
8038 // prepare log entry
8039 EUpdate *le = new EUpdate(mdlog, "unlink_local");
8040 mdlog->start_entry(le);
8041 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
8042 if (!mdr->more()->witnessed.empty()) {
8043 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
8044 le->reqid = mdr->reqid;
8045 le->had_peers = true;
8046 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8047 }
8048
8049 if (straydn) {
8050 ceph_assert(dnl->is_primary());
8051 straydn->push_projected_linkage(in);
8052 }
8053
8054 // the unlinked dentry
8055 dn->pre_dirty();
8056
8057 auto pi = in->project_inode(mdr);
8058 {
8059 std::string t;
8060 dn->make_path_string(t, true);
8061 pi.inode->stray_prior_path = std::move(t);
8062 }
8063 pi.inode->version = in->pre_dirty();
8064 pi.inode->ctime = mdr->get_op_stamp();
8065 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
8066 pi.inode->rstat.rctime = mdr->get_op_stamp();
8067 pi.inode->change_attr++;
8068 pi.inode->nlink--;
8069 if (pi.inode->nlink == 0)
8070 in->state_set(CInode::STATE_ORPHAN);
8071
8072 if (mdr->more()->desti_srnode) {
8073 auto& desti_srnode = mdr->more()->desti_srnode;
8074 in->project_snaprealm(desti_srnode);
8075 desti_srnode = NULL;
8076 }
8077
8078 if (straydn) {
8079 // will manually pop projected inode
8080
8081 // primary link. add stray dentry.
8082 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
8083 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8084
8085 pi.inode->update_backtrace();
8086 le->metablob.add_primary_dentry(straydn, in, true, true);
8087 } else {
8088 // remote link. update remote inode.
8089 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
8090 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
8091 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
8092 }
8093
8094 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
8095 le->metablob.add_null_dentry(dn, true);
8096
8097 if (in->is_dir()) {
8098 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8099 le->metablob.renamed_dirino = in->ino();
8100 }
8101
8102 dn->push_projected_linkage();
8103
8104 if (straydn) {
8105 ceph_assert(in->first <= straydn->first);
8106 in->first = straydn->first;
8107 }
8108
8109 if (in->is_dir()) {
8110 ceph_assert(straydn);
8111 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8112 }
8113
8114 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
8115 }
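// Note the two journaling shapes in _unlink_local() above: for a primary
// linkage the inode survives as a stray (both the source dir and the stray
// dir are predirtied and the stray is journaled as a primary dentry), while
// for a remote linkage only the source dir and the remote inode are updated.
// In both cases the original dentry is journaled as a null dentry.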
8116
8117 void Server::_unlink_local_finish(MDRequestRef& mdr,
8118 CDentry *dn, CDentry *straydn,
8119 version_t dnpv)
8120 {
8121 dout(10) << "_unlink_local_finish " << *dn << dendl;
8122
8123 if (!mdr->more()->witnessed.empty())
8124 mdcache->logged_leader_update(mdr->reqid);
8125
8126 CInode *strayin = NULL;
8127 bool hadrealm = false;
8128 if (straydn) {
 8129     // if there is a newly created snaprealm, we need to split the old snaprealm's
 8130     // inodes_with_caps, so pop the snaprealm before the linkage changes.
8131 strayin = dn->get_linkage()->get_inode();
8132 hadrealm = strayin->snaprealm ? true : false;
8133 strayin->early_pop_projected_snaprealm();
8134 }
8135
8136 // unlink main dentry
8137 dn->get_dir()->unlink_inode(dn);
8138 dn->pop_projected_linkage();
8139 dn->mark_dirty(dnpv, mdr->ls);
8140
8141 // relink as stray? (i.e. was primary link?)
8142 if (straydn) {
8143 dout(20) << " straydn is " << *straydn << dendl;
8144 straydn->pop_projected_linkage();
8145 mdcache->touch_dentry_bottom(straydn);
8146 }
8147
8148 mdr->apply();
8149
8150 dn->state_clear(CDentry::STATE_UNLINKING);
8151 mdcache->send_dentry_unlink(dn, straydn, mdr);
8152
8153 MDSContext::vec finished;
8154 dn->take_waiting(CDentry::WAIT_UNLINK_FINISH, finished);
8155 mdcache->mds->queue_waiters(finished);
8156
8157 if (straydn) {
8158 // update subtree map?
8159 if (strayin->is_dir())
8160 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
8161
8162 if (strayin->snaprealm && !hadrealm)
8163 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
8164 }
8165
8166 // bump pop
8167 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
8168
8169 // reply
8170 respond_to_request(mdr, 0);
8171
8172 // removing a new dn?
8173 dn->get_dir()->try_remove_unlinked_dn(dn);
8174
8175 // clean up ?
8176 // respond_to_request() drops locks. So stray reintegration can race with us.
8177 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8178 // Tip off the MDCache that this dentry is a stray that
 8179     // might be eligible for purge.
8180 mdcache->notify_stray(straydn);
8181 }
8182 }
8183
8184 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
8185 {
8186 if (mds->is_cluster_degraded() &&
8187 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8188 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
8189 if (mdr->more()->waiting_on_peer.empty())
8190 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8191 return false;
8192 }
8193
8194 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
8195 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
8196 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
8197 for (auto dn : trace)
8198 req->srcdnpath.push_dentry(dn->get_name());
8199 mdcache->encode_replica_stray(straydn, who, req->straybl);
8200 if (mdr->more()->desti_srnode)
8201 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8202
8203 req->op_stamp = mdr->get_op_stamp();
8204 mds->send_message_mds(req, who);
8205
8206 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8207 mdr->more()->waiting_on_peer.insert(who);
8208 return true;
8209 }
8210
8211 struct C_MDS_PeerRmdirPrep : public ServerLogContext {
8212 CDentry *dn, *straydn;
8213 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
8214 : ServerLogContext(s, r), dn(d), straydn(st) {}
8215 void finish(int r) override {
8216 server->_logged_peer_rmdir(mdr, dn, straydn);
8217 }
8218 };
8219
8220 struct C_MDS_PeerRmdirCommit : public ServerContext {
8221 MDRequestRef mdr;
8222 CDentry *straydn;
8223 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
8224 : ServerContext(s), mdr(r), straydn(sd) { }
8225 void finish(int r) override {
8226 server->_commit_peer_rmdir(mdr, r, straydn);
8227 }
8228 };
8229
8230 void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
8231 {
8232 dout(10) << "handle_peer_rmdir_prep " << *mdr
8233 << " " << mdr->peer_request->srcdnpath
8234 << " to " << mdr->peer_request->destdnpath
8235 << dendl;
8236
8237 vector<CDentry*> trace;
8238 filepath srcpath(mdr->peer_request->srcdnpath);
8239 dout(10) << " src " << srcpath << dendl;
8240 CInode *in;
8241 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
8242 int r = mdcache->path_traverse(mdr, cf, srcpath,
8243 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
8244 &trace, &in);
8245 if (r > 0) return;
8246 if (r == -CEPHFS_ESTALE) {
8247 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8248 mdr->peer_to_mds, true);
8249 return;
8250 }
8251 ceph_assert(r == 0);
8252 CDentry *dn = trace.back();
8253 dout(10) << " dn " << *dn << dendl;
8254 mdr->pin(dn);
8255
8256 ceph_assert(mdr->straydn);
8257 CDentry *straydn = mdr->straydn;
8258 dout(10) << " straydn " << *straydn << dendl;
8259
8260 mdr->set_op_stamp(mdr->peer_request->op_stamp);
8261
8262 rmdir_rollback rollback;
8263 rollback.reqid = mdr->reqid;
8264 rollback.src_dir = dn->get_dir()->dirfrag();
8265 rollback.src_dname = dn->get_name();
8266 rollback.dest_dir = straydn->get_dir()->dirfrag();
8267 rollback.dest_dname = straydn->get_name();
8268 if (mdr->peer_request->desti_snapbl.length()) {
8269 if (in->snaprealm) {
8270 encode(true, rollback.snapbl);
8271 in->encode_snap_blob(rollback.snapbl);
8272 } else {
8273 encode(false, rollback.snapbl);
8274 }
8275 }
8276 encode(rollback, mdr->more()->rollback_bl);
8277 // FIXME: rollback snaprealm
8278 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
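// If the leader later aborts, this blob is decoded by do_rmdir_rollback()
// (further below) to relink the inode back under its original dentry.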
8279
8280 // set up commit waiter
8281 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
8282
8283 straydn->push_projected_linkage(in);
8284 dn->push_projected_linkage();
8285
8286 ceph_assert(straydn->first >= in->first);
8287 in->first = straydn->first;
8288
8289 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
8290 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
8291 _logged_peer_rmdir(mdr, dn, straydn);
8292 return;
8293 }
8294
8295 mdr->ls = mdlog->get_current_segment();
8296 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
8297 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
8298 mdlog->start_entry(le);
8299 le->rollback = mdr->more()->rollback_bl;
8300
8301 le->commit.add_dir_context(straydn->get_dir());
8302 le->commit.add_primary_dentry(straydn, in, true);
8303 // peer: no need to journal original dentry
8304
8305 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8306 le->commit.renamed_dirino = in->ino();
8307
8308 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8309 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
8310
8311 mdr->more()->peer_update_journaled = true;
8312 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
8313 mdr, __func__);
8314 mdlog->flush();
8315 }
8316
8317 void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8318 {
8319 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
8320 CInode *in = dn->get_linkage()->get_inode();
8321
8322 bool new_realm;
8323 if (mdr->peer_request->desti_snapbl.length()) {
8324 new_realm = !in->snaprealm;
8325 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
8326 ceph_assert(in->snaprealm);
8327 } else {
8328 new_realm = false;
8329 }
8330
8331 // update our cache now, so we are consistent with what is in the journal
8332 // when we journal a subtree map
8333 dn->get_dir()->unlink_inode(dn);
8334 straydn->pop_projected_linkage();
8335 dn->pop_projected_linkage();
8336
8337 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
8338
8339 if (new_realm)
8340 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
8341
8342 // done.
8343 mdr->reset_peer_request();
8344 mdr->straydn = 0;
8345
8346 if (!mdr->aborted) {
8347 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
8348 if (!mdr->more()->peer_update_journaled)
8349 reply->mark_not_journaled();
8350 mds->send_message_mds(reply, mdr->peer_to_mds);
8351 } else {
8352 dout(10) << " abort flag set, finishing" << dendl;
8353 mdcache->request_finish(mdr);
8354 }
8355 }
8356
8357 void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
8358 {
8359 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8360 << " " << *ack << dendl;
8361
8362 mds_rank_t from = mds_rank_t(ack->get_source().num());
8363
8364 mdr->more()->peers.insert(from);
8365 mdr->more()->witnessed.insert(from);
8366 if (!ack->is_not_journaled())
8367 mdr->more()->has_journaled_peers = true;
8368
8369 // remove from waiting list
8370 ceph_assert(mdr->more()->waiting_on_peer.count(from));
8371 mdr->more()->waiting_on_peer.erase(from);
8372
8373 if (mdr->more()->waiting_on_peer.empty())
8374 dispatch_client_request(mdr); // go again!
8375 else
8376 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
8377 }
8378
8379 void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
8380 {
8381 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
8382
8383 if (r == 0) {
8384 if (mdr->more()->peer_update_journaled) {
8385 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8386 if (strayin && !strayin->snaprealm)
8387 mdcache->clear_dirty_bits_for_stray(strayin);
8388 }
8389
8390 mdr->cleanup();
8391
8392 if (mdr->more()->peer_update_journaled) {
8393 // write a commit to the journal
8394 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
8395 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
8396 EPeerUpdate::RMDIR);
8397 mdlog->start_entry(le);
8398 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
8399 mdlog->flush();
8400 } else {
8401 _committed_peer(mdr);
8402 }
8403 } else {
8404 // abort
8405 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
8406 }
8407 }
8408
8409 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
8410 metareqid_t reqid;
8411 CDentry *dn;
8412 CDentry *straydn;
8413 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
8414 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
8415 void finish(int r) override {
8416 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
8417 }
8418 };
8419
8420 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
8421 {
8422 // unlike the other rollback methods, the rmdir rollback is only
8423 // needed to record the subtree changes in the journal for inode
8424 // replicas that are auth for empty dirfrags. no actual changes to
8425 // the file system are taking place here, so there is no Mutation.
8426
8427 rmdir_rollback rollback;
8428 auto p = rbl.cbegin();
8429 decode(rollback, p);
8430
8431 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
8432 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
8433 ceph_assert(mdr || mds->is_resolve());
8434
8435 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
8436 if (!dir)
8437 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
8438 ceph_assert(dir);
8439 CDentry *dn = dir->lookup(rollback.src_dname);
8440 ceph_assert(dn);
8441 dout(10) << " dn " << *dn << dendl;
8442 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
8443 ceph_assert(straydir);
8444 CDentry *straydn = straydir->lookup(rollback.dest_dname);
8445 ceph_assert(straydn);
8446 dout(10) << " straydn " << *straydn << dendl;
8447 CInode *in = straydn->get_linkage()->get_inode();
8448
8449 dn->push_projected_linkage(in);
8450 straydn->push_projected_linkage();
8451
8452 if (rollback.snapbl.length() && in->snaprealm) {
8453 bool hadrealm;
8454 auto p = rollback.snapbl.cbegin();
8455 decode(hadrealm, p);
8456 if (hadrealm) {
8457 decode(in->snaprealm->srnode, p);
8458 } else {
8459 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
8460 }
8461 }
8462
8463 if (mdr && !mdr->more()->peer_update_journaled) {
8464 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
8465
8466 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
8467 return;
8468 }
8469
8470
8471 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
8472 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
8473 mdlog->start_entry(le);
8474
8475 le->commit.add_dir_context(dn->get_dir());
8476 le->commit.add_primary_dentry(dn, in, true);
8477 // peer: no need to journal straydn
8478
8479 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8480 le->commit.renamed_dirino = in->ino();
8481
8482 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
8483
8484 submit_mdlog_entry(le,
8485 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
8486 dn, straydn),
8487 mdr, __func__);
8488 mdlog->flush();
8489 }
8490
8491 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
8492 {
8493 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
8494
8495 straydn->get_dir()->unlink_inode(straydn);
8496 dn->pop_projected_linkage();
8497 straydn->pop_projected_linkage();
8498
8499 CInode *in = dn->get_linkage()->get_inode();
8500 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
8501 !mdr || mdr->more()->peer_update_journaled);
8502
8503 if (mds->is_resolve()) {
8504 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
8505 mdcache->try_trim_non_auth_subtree(root);
8506 }
8507
8508 if (mdr)
8509 mdcache->request_finish(mdr);
8510
8511 mdcache->finish_rollback(reqid, mdr);
8512 }
8513
8514
8515 /** _dir_is_nonempty[_unlocked]
8516 *
8517 * check if a directory is non-empty (if it is, we cannot rmdir it).
8518 *
8519 * the unlocked variant is a fastpath check. we can't really be
8520 * sure until we rdlock the filelock.
8521 */
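// In practice (see the two functions below): the unlocked variant bails out
// early when the filelock is in the cached state, since async creates or
// unlinks may still be in flight, and otherwise only looks at the projected
// fragstat of each auth dirfrag. The locked variant additionally sums the
// per-frag fragstats and compares them with the inode's dirstat, which is
// only reliable once the filelock is rdlockable.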
8522 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
8523 {
8524 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
8525 ceph_assert(in->is_auth());
8526
8527 if (in->filelock.is_cached())
8528 return false; // there can be pending async create/unlink. don't know.
8529 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
8530 return true; // in a snapshot!
8531
8532 auto&& ls = in->get_dirfrags();
8533 for (const auto& dir : ls) {
8534 // is the frag obviously non-empty?
8535 if (dir->is_auth()) {
8536 if (dir->get_projected_fnode()->fragstat.size()) {
8537 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8538 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
8539 return true;
8540 }
8541 }
8542 }
8543
8544 return false;
8545 }
8546
8547 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
8548 {
8549 dout(10) << "dir_is_nonempty " << *in << dendl;
8550 ceph_assert(in->is_auth());
8551 ceph_assert(in->filelock.can_read(mdr->get_client()));
8552
8553 frag_info_t dirstat;
8554 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
8555
8556 auto&& ls = in->get_dirfrags();
8557 for (const auto& dir : ls) {
8558 const auto& pf = dir->get_projected_fnode();
8559 if (pf->fragstat.size()) {
8560 dout(10) << "dir_is_nonempty dirstat has "
8561 << pf->fragstat.size() << " items " << *dir << dendl;
8562 return true;
8563 }
8564
8565 if (pf->accounted_fragstat.version == dirstat_version)
8566 dirstat.add(pf->accounted_fragstat);
8567 else
8568 dirstat.add(pf->fragstat);
8569 }
8570
8571 return dirstat.size() != in->get_projected_inode()->dirstat.size();
8572 }
8573
8574
8575 // ======================================================
8576
8577
8578 class C_MDS_rename_finish : public ServerLogContext {
8579 CDentry *srcdn;
8580 CDentry *destdn;
8581 CDentry *straydn;
8582 public:
8583 C_MDS_rename_finish(Server *s, MDRequestRef& r,
8584 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
8585 ServerLogContext(s, r),
8586 srcdn(sdn), destdn(ddn), straydn(stdn) { }
8587 void finish(int r) override {
8588 ceph_assert(r == 0);
8589 server->_rename_finish(mdr, srcdn, destdn, straydn);
8590 }
8591 };
8592
8593
8594 /** handle_client_rename
8595 *
8596 * rename leader is the destdn auth. this is because cached inodes
8597 * must remain connected. thus, any replica of srci, must also
8598 * replicate destdn, and possibly straydn, so that srci (and
8599 * destdn->inode) remain connected during the rename.
8600 *
8601 * to do this, we freeze srci, then leader (destdn auth) verifies that
8602 * all other nodes have also replicated destdn and straydn. note that
8603 * destdn replicas need not also replicate srci. this only works when
8604 * destdn is leader.
8605 *
8606 * This function takes responsibility for the passed mdr.
8607 */
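// Rough flow of handle_client_rename(), as implemented below:
// 1. path resolution and sanity checks (self-rename, ancestor check,
//    stray-migration checks, existing-destination compatibility);
// 2. extend src/dest traces to a common ancestor and prepare a stray
//    dentry if the destination's primary link will be overwritten;
// 3. take the remaining locks, then access and directory-limit checks
//    and snaprealm preparation;
// 4. gather witnesses and send OP_RENAMEPREP to each, srcdn auth last;
// 5. journal an EUpdate and reply via C_MDS_rename_finish.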
8608 void Server::handle_client_rename(MDRequestRef& mdr)
8609 {
8610 const auto& req = mdr->client_request;
8611 dout(7) << "handle_client_rename " << *req << dendl;
8612
8613 filepath destpath = req->get_filepath();
8614 filepath srcpath = req->get_filepath2();
8615 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
8616 respond_to_request(mdr, -CEPHFS_EBUSY);
8617 return;
8618 }
8619
8620 if (req->get_alternate_name().size() > alternate_name_max) {
8621 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8622 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
8623 return;
8624 }
8625
8626 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8627 if (!destdn)
8628 return;
8629
8630 if (is_unlink_pending(destdn)) {
8631 wait_for_pending_unlink(destdn, mdr);
8632 return;
8633 }
8634
8635 if (is_unlink_pending(srcdn)) {
8636 wait_for_pending_unlink(srcdn, mdr);
8637 return;
8638 }
8639
8640 dout(10) << " destdn " << *destdn << dendl;
8641 CDir *destdir = destdn->get_dir();
8642 ceph_assert(destdir->is_auth());
8643 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8644
8645 dout(10) << " srcdn " << *srcdn << dendl;
8646 CDir *srcdir = srcdn->get_dir();
8647 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8648 CInode *srci = srcdnl->get_inode();
8649 dout(10) << " srci " << *srci << dendl;
8650
8651 // -- some sanity checks --
8652 if (destdn == srcdn) {
8653 dout(7) << "rename src=dest, noop" << dendl;
8654 respond_to_request(mdr, 0);
8655 return;
8656 }
8657
8658 // dest a child of src?
8659 // e.g. mv /usr /usr/foo
8660 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8661 dout(7) << "cannot rename item to be a child of itself" << dendl;
8662 respond_to_request(mdr, -CEPHFS_EINVAL);
8663 return;
8664 }
8665
8666 // is this a stray migration, reintegration or merge? (sanity checks!)
8667 if (mdr->reqid.name.is_mds() &&
8668 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8669 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8670 !(destdnl->is_remote() &&
8671 destdnl->get_remote_ino() == srci->ino())) {
8672 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
8673 return;
8674 }
8675
8676 CInode *oldin = 0;
8677 if (!destdnl->is_null()) {
8678 //dout(10) << "dest dn exists " << *destdn << dendl;
8679 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8680 if (!oldin) return;
8681 dout(10) << " oldin " << *oldin << dendl;
8682
8683 // non-empty dir? do a trivial fast unlocked check now; another check is done later with read locks
8684 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
8685 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8686 return;
8687 }
8688
8689 // mv /some/thing /to/some/existing_other_thing
8690 if (oldin->is_dir() && !srci->is_dir()) {
8691 respond_to_request(mdr, -CEPHFS_EISDIR);
8692 return;
8693 }
8694 if (!oldin->is_dir() && srci->is_dir()) {
8695 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8696 return;
8697 }
8698 if (srci == oldin && !srcdir->inode->is_stray()) {
8699 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8700 return;
8701 }
8702 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8703 /* the dentry exists but the alternate_names do not match, fail... */
8704 respond_to_request(mdr, -CEPHFS_EINVAL);
8705 return;
8706 }
8707 }
8708
8709 vector<CDentry*>& srctrace = mdr->dn[1];
8710 vector<CDentry*>& desttrace = mdr->dn[0];
8711
8712 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8713 if (destpath.get_ino() != srcpath.get_ino() &&
8714 !(req->get_source().is_mds() &&
8715 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8716 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8717 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8718 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8719 while (srcbase != destbase &&
8720 !srcbase->is_projected_ancestor_of(destbase)) {
8721 CDentry *pdn = srcbase->get_projected_parent_dn();
8722 srctrace.insert(srctrace.begin(), pdn);
8723 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8724 srcbase = pdn->get_dir()->get_inode();
8725 }
8726
8727 // then, extend destpath until it shares the same parent inode as srcpath.
8728 while (destbase != srcbase) {
8729 CDentry *pdn = destbase->get_projected_parent_dn();
8730 desttrace.insert(desttrace.begin(), pdn);
8731 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8732 destbase = pdn->get_dir()->get_inode();
8733 }
8734 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8735 }
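// (The loops above simply walk each base up through its projected parents,
// prepending the corresponding dentries, so that both traces end up rooted
// at the same ancestor inode before locks are ordered and acquired.)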
8736
8737
8738 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8739 if (linkmerge)
8740 dout(10) << " this is a link merge" << dendl;
8741
8742 // -- create stray dentry? --
8743 CDentry *straydn = NULL;
8744 if (destdnl->is_primary() && !linkmerge) {
8745 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8746 if (!straydn)
8747 return;
8748 dout(10) << " straydn is " << *straydn << dendl;
8749 } else if (mdr->straydn) {
8750 mdr->unpin(mdr->straydn);
8751 mdr->straydn = NULL;
8752 }
8753
8754
8755 // -- locks --
8756 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8757 MutationImpl::LockOpVec lov;
8758
8759 // we need to update srci's ctime. xlock its least contended lock to do that...
8760 lov.add_xlock(&srci->linklock);
8761 lov.add_xlock(&srci->snaplock);
8762
8763 if (oldin) {
8764 // xlock oldin (for nlink--)
8765 lov.add_xlock(&oldin->linklock);
8766 lov.add_xlock(&oldin->snaplock);
8767 if (oldin->is_dir()) {
8768 ceph_assert(srci->is_dir());
8769 lov.add_rdlock(&oldin->filelock); // to verify it's empty
8770
8771 // adjust locking order?
8772 int cmp = mdr->compare_paths();
8773 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8774 std::reverse(lov.begin(), lov.end());
8775 } else {
8776 ceph_assert(!srci->is_dir());
8777 // adjust locking order;
8778 if (srci->ino() > oldin->ino())
8779 std::reverse(lov.begin(), lov.end());
8780 }
8781 }
8782
8783 // straydn?
8784 if (straydn) {
8785 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8786 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8787 lov.add_xlock(&straydn->lock);
8788 }
8789
8790 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8791 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8792 return;
8793
8794 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8795 }
8796
8797 if (linkmerge)
8798 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8799
8800 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
8801 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8802 return;
8803
8804 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8805 return;
8806
8807 if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
8808 return;
8809
8810 if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
8811 return;
8812
8813 if (!check_access(mdr, srci, MAY_WRITE))
8814 return;
8815 }
8816
8817 // with read lock, really verify oldin is empty
8818 if (oldin &&
8819 oldin->is_dir() &&
8820 _dir_is_nonempty(mdr, oldin)) {
8821 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8822 return;
8823 }
8824
8825 /* project_snaprealm_past_parent() will do this job
8826 *
8827 // moving between snaprealms?
8828 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8829 SnapRealm *srcrealm = srci->find_snaprealm();
8830 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8831 if (srcrealm != destrealm &&
8832 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8833 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8834 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8835 mdcache->snaprealm_create(mdr, srci);
8836 return;
8837 }
8838 }
8839 */
8840
8841 SnapRealm *dest_realm = nullptr;
8842 SnapRealm *src_realm = nullptr;
8843 if (!linkmerge) {
8844 dest_realm = destdir->inode->find_snaprealm();
8845 if (srcdir->inode == destdir->inode)
8846 src_realm = dest_realm;
8847 else
8848 src_realm = srcdir->inode->find_snaprealm();
8849 if (src_realm != dest_realm &&
8850 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
8851 respond_to_request(mdr, -CEPHFS_EXDEV);
8852 return;
8853 }
8854 }
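// i.e. renames may cross snaprealms, but not subvolume boundaries: if the
// source and destination realms belong to different subvolumes the request
// is rejected with EXDEV, as for a cross-device rename.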
8855
8856 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8857
8858 // -- open all srcdn inode frags, if any --
8859 // we need these open so that auth can properly delegate from inode to dirfrags
8860 // after the inode is _ours_.
8861 if (srcdnl->is_primary() &&
8862 !srcdn->is_auth() &&
8863 srci->is_dir()) {
8864 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8865 mdr->set_stickydirs(srci);
8866
8867 frag_vec_t leaves;
8868 srci->dirfragtree.get_leaves(leaves);
8869 for (const auto& leaf : leaves) {
8870 CDir *dir = srci->get_dirfrag(leaf);
8871 if (!dir) {
8872 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8873 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8874 return;
8875 }
8876 }
8877 }
8878
8879 // -- prepare snaprealm ---
8880
8881 if (linkmerge) {
8882 if (!mdr->more()->srci_srnode &&
8883 srci->get_projected_inode()->nlink == 1 &&
8884 srci->is_projected_snaprealm_global()) {
8885 sr_t *new_srnode = srci->prepare_new_srnode(0);
8886 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8887
8888 srci->clear_snaprealm_global(new_srnode);
8889 mdr->more()->srci_srnode = new_srnode;
8890 }
8891 } else {
8892 if (oldin && !mdr->more()->desti_srnode) {
8893 if (oldin->is_projected_snaprealm_global()) {
8894 sr_t *new_srnode = oldin->prepare_new_srnode(0);
8895 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8896 // when dropping the last linkage, or the last remote linkage,
8897 // detach the inode from the global snaprealm
8898 auto nlink = oldin->get_projected_inode()->nlink;
8899 if (nlink == 1 ||
8900 (nlink == 2 && !destdnl->is_primary() &&
8901 !oldin->get_projected_parent_dir()->inode->is_stray()))
8902 oldin->clear_snaprealm_global(new_srnode);
8903 mdr->more()->desti_srnode = new_srnode;
8904 } else if (destdnl->is_primary()) {
8905 snapid_t follows = dest_realm->get_newest_seq();
8906 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8907 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8908 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8909 mdr->more()->desti_srnode = new_srnode;
8910 }
8911 }
8912 }
8913 if (!mdr->more()->srci_srnode) {
8914 if (srci->is_projected_snaprealm_global()) {
8915 sr_t *new_srnode = srci->prepare_new_srnode(0);
8916 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8917 mdr->more()->srci_srnode = new_srnode;
8918 } else if (srcdnl->is_primary()) {
8919 snapid_t follows = src_realm->get_newest_seq();
8920 if (src_realm != dest_realm &&
8921 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8922 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8923 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8924 mdr->more()->srci_srnode = new_srnode;
8925 }
8926 }
8927 }
8928 }
8929
8930 // -- prepare witnesses --
8931
8932 /*
8933 * NOTE: we use _all_ replicas as witnesses.
8934 * this probably isn't totally necessary (esp for file renames),
8935 * but if/when we change that, we have to make sure rejoin is
8936 * sufficiently robust to handle strong rejoins from survivors
8937 * with totally wrong dentry->inode linkage.
8938 * (currently, it can ignore rename effects, because the resolve
8939 * stage will sort them out.)
8940 */
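// Concretely, the set assembled below contains: any extra_witnesses already
// recorded on the request, all replicas of srcdn (or its auth if we are not
// auth), the auth of a remote srci, all replicas of destdn, and the auth of
// a remote overwritten target. Each witness gets an OP_RENAMEPREP via
// _rename_prepare_witness(); the srcdn auth is intentionally contacted last
// so it can verify that the accumulated witness list is sufficient.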
8941 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8942 if (srcdn->is_auth())
8943 srcdn->list_replicas(witnesses);
8944 else
8945 witnesses.insert(srcdn->authority().first);
8946 if (srcdnl->is_remote() && !srci->is_auth())
8947 witnesses.insert(srci->authority().first);
8948 destdn->list_replicas(witnesses);
8949 if (destdnl->is_remote() && !oldin->is_auth())
8950 witnesses.insert(oldin->authority().first);
8951 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8952
8953 if (!witnesses.empty()) {
8954 // Replicas can't see projected dentry linkages and will get confused.
8955 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8956 // can't project these inodes' linkages.
8957 bool need_flush = false;
8958 for (auto& dn : srctrace) {
8959 if (dn->is_projected()) {
8960 need_flush = true;
8961 break;
8962 }
8963 }
8964 if (!need_flush) {
8965 CDentry *dn = destdn;
8966 do {
8967 if (dn->is_projected()) {
8968 need_flush = true;
8969 break;
8970 }
8971 CInode *diri = dn->get_dir()->get_inode();
8972 dn = diri->get_projected_parent_dn();
8973 } while (dn);
8974 }
8975 if (need_flush) {
8976 mdlog->wait_for_safe(
8977 new MDSInternalContextWrapper(mds,
8978 new C_MDS_RetryRequest(mdcache, mdr)));
8979 mdlog->flush();
8980 return;
8981 }
8982 }
8983
8984 // do srcdn auth last
8985 mds_rank_t last = MDS_RANK_NONE;
8986 if (!srcdn->is_auth()) {
8987 last = srcdn->authority().first;
8988 mdr->more()->srcdn_auth_mds = last;
8989 // ask the auth of srci to mark srci as ambiguous auth if more than two MDSs
8990 // are involved in the rename operation.
8991 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8992 dout(10) << " preparing ambiguous auth for srci" << dendl;
8993 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8994 ceph_assert(mdr->more()->rename_inode == srci);
8995 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8996 return;
8997 }
8998 }
8999
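// Contact every other witness first; the srcdn auth ("last") is handled
// after this loop so that its prep message carries the full witness set.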
9000 for (set<mds_rank_t>::iterator p = witnesses.begin();
9001 p != witnesses.end();
9002 ++p) {
9003 if (*p == last) continue; // do it last!
9004 if (mdr->more()->witnessed.count(*p)) {
9005 dout(10) << " already witnessed by mds." << *p << dendl;
9006 } else if (mdr->more()->waiting_on_peer.count(*p)) {
9007 dout(10) << " already waiting on witness mds." << *p << dendl;
9008 } else {
9009 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
9010 return;
9011 }
9012 }
9013 if (!mdr->more()->waiting_on_peer.empty())
9014 return; // we're waiting for a witness.
9015
9016 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
9017 dout(10) << " preparing last witness (srcdn auth)" << dendl;
9018 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
9019 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
9020 return;
9021 }
9022
9023 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
9024 if (!mdr->more()->peers.empty() && !srci->is_dir())
9025 ceph_assert(g_conf()->mds_kill_rename_at != 3);
9026 if (!mdr->more()->peers.empty() && srci->is_dir())
9027 ceph_assert(g_conf()->mds_kill_rename_at != 4);
9028
9029 // -- declare now --
9030 mdr->set_mds_stamp(ceph_clock_now());
9031
9032 // -- prepare journal entry --
9033 mdr->ls = mdlog->get_current_segment();
9034 EUpdate *le = new EUpdate(mdlog, "rename");
9035 mdlog->start_entry(le);
9036 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
9037 if (!mdr->more()->witnessed.empty()) {
9038 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
9039
9040 le->reqid = mdr->reqid;
9041 le->had_peers = true;
9042
9043 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
9044 // no need to send frozen auth pin to the recovering auth MDS of srci
9045 mdr->more()->is_remote_frozen_authpin = false;
9046 }
9047
9048 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
9049 if (le->client_map.length())
9050 le->cmapv = mds->sessionmap.get_projected();
9051
9052 // -- commit locally --
9053 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
9054
9055 journal_and_reply(mdr, srci, destdn, le, fin);
9056
9057 // trigger an mdlog flush in case we are reintegrating or migrating the stray dn,
9058 // because link requests may be waiting.
9059 if (srcdn->get_dir()->inode->is_stray()) {
9060 mdlog->flush();
9061 }
9062 mds->balancer->maybe_fragment(destdn->get_dir(), false);
9063 }
9064
9065
9066 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9067 {
9068 dout(10) << "_rename_finish " << *mdr << dendl;
9069
9070 if (!mdr->more()->witnessed.empty())
9071 mdcache->logged_leader_update(mdr->reqid);
9072
9073 // apply
9074 _rename_apply(mdr, srcdn, destdn, straydn);
9075
9076 mdcache->send_dentry_link(destdn, mdr);
9077
9078 CDentry::linkage_t *destdnl = destdn->get_linkage();
9079 CInode *in = destdnl->get_inode();
9080 bool need_eval = mdr->more()->cap_imports.count(in);
9081
9082 // test hack: test peer commit
9083 if (!mdr->more()->peers.empty() && !in->is_dir())
9084 ceph_assert(g_conf()->mds_kill_rename_at != 5);
9085 if (!mdr->more()->peers.empty() && in->is_dir())
9086 ceph_assert(g_conf()->mds_kill_rename_at != 6);
9087
9088 // bump popularity
9089 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9090 if (destdnl->is_remote() && in->is_auth())
9091 mds->balancer->hit_inode(in, META_POP_IWR);
9092
9093 // did we import srci? if so, explicitly ack that import before we unlock and reply.
9094
9095 ceph_assert(g_conf()->mds_kill_rename_at != 7);
9096
9097 // reply
9098 respond_to_request(mdr, 0);
9099
9100 if (need_eval)
9101 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
9102
9103 // clean up?
9104 // respond_to_request() drops locks. So stray reintegration can race with us.
9105 if (straydn && !straydn->get_projected_linkage()->is_null()) {
9106 mdcache->notify_stray(straydn);
9107 }
9108 }
9109
9110
9111
9112 // helpers
9113
9114 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
9115 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
9116 {
9117 const auto& client_req = mdr->client_request;
9118 ceph_assert(client_req);
9119
9120 if (mds->is_cluster_degraded() &&
9121 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
9122 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
9123 if (mdr->more()->waiting_on_peer.empty())
9124 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
9125 return false;
9126 }
9127
9128 dout(10) << "_rename_prepare_witness mds." << who << dendl;
9129 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
9130
9131 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
9132 for (auto dn : srctrace)
9133 req->srcdnpath.push_dentry(dn->get_name());
9134 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
9135 for (auto dn : dsttrace)
9136 req->destdnpath.push_dentry(dn->get_name());
9137 req->alternate_name = client_req->alternate_name;
9138 if (straydn)
9139 mdcache->encode_replica_stray(straydn, who, req->straybl);
9140
9141 if (mdr->more()->srci_srnode)
9142 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
9143 if (mdr->more()->desti_srnode)
9144 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
9145
9146 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
9147
9148 // srcdn auth will verify our current witness list is sufficient
9149 req->witnesses = witnesses;
9150
9151 req->op_stamp = mdr->get_op_stamp();
9152 mds->send_message_mds(req, who);
9153
9154 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
9155 mdr->more()->waiting_on_peer.insert(who);
9156 return true;
9157 }
9158
9159 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
9160 {
9161 version_t oldpv = mdr->more()->inode_import_v;
9162
9163 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9164
9165 /* import node */
9166 auto blp = mdr->more()->inode_import.cbegin();
9167
9168 // imported caps
9169 map<client_t,entity_inst_t> client_map;
9170 map<client_t, client_metadata_t> client_metadata_map;
9171 decode(client_map, blp);
9172 decode(client_metadata_map, blp);
9173 prepare_force_open_sessions(client_map, client_metadata_map,
9174 mdr->more()->imported_session_map);
9175 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
9176 encode(client_metadata_map, *client_map_bl);
9177
9178 list<ScatterLock*> updated_scatterlocks;
9179 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
9180 mdr->more()->cap_imports, updated_scatterlocks);
9181
9182 // hack: force back to !auth and clean, temporarily
9183 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
9184 srcdnl->get_inode()->mark_clean();
9185
9186 return oldpv;
9187 }
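// Note: the helper above unpacks the inode bundle exported by the srcdn auth:
// it decodes the client (metadata) maps, force-opens sessions for the imported
// caps, imports the inode itself, and then temporarily marks the inode !auth
// and clean; _rename_apply() flips the auth bit back and finishes the cap
// import once the rename is journaled.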
9188
9189 bool Server::_need_force_journal(CInode *diri, bool empty)
9190 {
9191 auto&& dirs = diri->get_dirfrags();
9192
9193 bool force_journal = false;
9194 if (empty) {
9195 for (const auto& dir : dirs) {
9196 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
9197 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
9198 force_journal = true;
9199 break;
9200 } else
9201 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
9202 }
9203 } else {
9204 // see if any children of our frags are auth subtrees.
9205 std::vector<CDir*> subtrees;
9206 mdcache->get_subtrees(subtrees);
9207 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
9208 for (const auto& dir : dirs) {
9209 for (const auto& subtree : subtrees) {
9210 if (dir->contains(subtree)) {
9211 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
9212 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
9213 << *subtree << dendl;
9214 force_journal = true;
9215 break;
9216 } else
9217 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
9218 } else
9219 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
9220 }
9221 if (force_journal)
9222 break;
9223 }
9224 }
9225 return force_journal;
9226 }
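// Summary: we force-journal a dentry we are not auth for whenever the renamed
// (or overwritten) directory has auth subtree dirfrags on this rank -- either
// the dirfrags themselves (empty case) or auth subtrees nested somewhere
// beneath them -- because journal replay needs those entries to recreate the
// auth subtrees.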
9227
9228 void Server::_rename_prepare(MDRequestRef& mdr,
9229 EMetaBlob *metablob, bufferlist *client_map_bl,
9230 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
9231 CDentry *straydn)
9232 {
9233 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9234 if (straydn)
9235 dout(10) << " straydn " << *straydn << dendl;
9236
9237 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9238 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9239 CInode *srci = srcdnl->get_inode();
9240 CInode *oldin = destdnl->get_inode();
9241
9242 // primary+remote link merge?
9243 bool linkmerge = (srci == oldin);
9244 if (linkmerge)
9245 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9246 bool silent = srcdn->get_dir()->inode->is_stray();
9247
9248 bool force_journal_dest = false;
9249 if (srci->is_dir() && !destdn->is_auth()) {
9250 if (srci->is_auth()) {
9251 // if we are auth for srci and exporting it, force journal because journal replay needs
9252 // the source inode to create auth subtrees.
9253 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
9254 force_journal_dest = true;
9255 } else
9256 force_journal_dest = _need_force_journal(srci, false);
9257 }
9258
9259 bool force_journal_stray = false;
9260 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
9261 force_journal_stray = _need_force_journal(oldin, true);
9262
9263 if (linkmerge)
9264 dout(10) << " merging remote and primary links to the same inode" << dendl;
9265 if (silent)
9266 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
9267 if (force_journal_dest)
9268 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
9269 if (force_journal_stray)
9270 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
9271
9272 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
9273 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
9274 metablob->renamed_dirino = srci->ino();
9275 } else if (oldin && oldin->is_dir() && force_journal_stray) {
9276 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
9277 metablob->renamed_dirino = oldin->ino();
9278 }
9279
9280 // prepare
9281 CInode::mempool_inode *spi = 0; // renamed inode
9282 CInode::mempool_inode *tpi = 0; // target/overwritten inode
9283
9284 // target inode
9285 if (!linkmerge) {
9286 if (destdnl->is_primary()) {
9287 ceph_assert(straydn); // moving to straydn.
9288 // link--, and move.
9289 if (destdn->is_auth()) {
9290 auto pi= oldin->project_inode(mdr); //project_snaprealm
9291 pi.inode->version = straydn->pre_dirty(pi.inode->version);
9292 pi.inode->update_backtrace();
9293 tpi = pi.inode.get();
9294 }
9295 straydn->push_projected_linkage(oldin);
9296 } else if (destdnl->is_remote()) {
9297 // nlink-- targeti
9298 if (oldin->is_auth()) {
9299 auto pi = oldin->project_inode(mdr);
9300 pi.inode->version = oldin->pre_dirty();
9301 tpi = pi.inode.get();
9302 }
9303 }
9304 }
9305
9306 // dest
9307 if (destdnl->is_null()) {
9308 /* handle_client_rename checks that alternate_name matches for existing destdn */
9309 destdn->set_alternate_name(alternate_name);
9310 }
9311 if (srcdnl->is_remote()) {
9312 if (!linkmerge) {
9313 // destdn
9314 if (destdn->is_auth())
9315 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
9316 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9317 // srci
9318 if (srci->is_auth()) {
9319 auto pi = srci->project_inode(mdr);
9320 pi.inode->version = srci->pre_dirty();
9321 spi = pi.inode.get();
9322 }
9323 } else {
9324 dout(10) << " will merge remote onto primary link" << dendl;
9325 if (destdn->is_auth()) {
9326 auto pi = oldin->project_inode(mdr);
9327 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
9328 spi = pi.inode.get();
9329 }
9330 }
9331 } else { // primary
9332 if (destdn->is_auth()) {
9333 version_t oldpv;
9334 if (srcdn->is_auth())
9335 oldpv = srci->get_projected_version();
9336 else {
9337 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
9338
9339 // note which dirfrags have child subtrees in the journal
9340 // event, so that we can open those (as bounds) during replay.
9341 if (srci->is_dir()) {
9342 auto&& ls = srci->get_dirfrags();
9343 for (const auto& dir : ls) {
9344 if (!dir->is_auth())
9345 metablob->renamed_dir_frags.push_back(dir->get_frag());
9346 }
9347 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
9348 }
9349 }
9350 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
9351 // & srcdnl->snaprealm
9352 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
9353 pi.inode->update_backtrace();
9354 spi = pi.inode.get();
9355 }
9356 destdn->push_projected_linkage(srci);
9357 }
9358
9359 // src
9360 if (srcdn->is_auth())
9361 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
9362 srcdn->push_projected_linkage(); // push null linkage
9363
9364 if (!silent) {
9365 if (spi) {
9366 spi->ctime = mdr->get_op_stamp();
9367 if (mdr->get_op_stamp() > spi->rstat.rctime)
9368 spi->rstat.rctime = mdr->get_op_stamp();
9369 spi->change_attr++;
9370 if (linkmerge)
9371 spi->nlink--;
9372 }
9373 if (tpi) {
9374 tpi->ctime = mdr->get_op_stamp();
9375 if (mdr->get_op_stamp() > tpi->rstat.rctime)
9376 tpi->rstat.rctime = mdr->get_op_stamp();
9377 tpi->change_attr++;
9378 {
9379 std::string t;
9380 destdn->make_path_string(t, true);
9381 tpi->stray_prior_path = std::move(t);
9382 }
9383 tpi->nlink--;
9384 if (tpi->nlink == 0)
9385 oldin->state_set(CInode::STATE_ORPHAN);
9386 }
9387 }
9388
9389 // prepare nesting, mtime updates
9390 int predirty_dir = silent ? 0:PREDIRTY_DIR;
9391
9392 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9393 // then link the source inode to destdn
9394 if (destdnl->is_primary()) {
9395 ceph_assert(straydn);
9396 if (straydn->is_auth()) {
9397 metablob->add_dir_context(straydn->get_dir());
9398 metablob->add_dir(straydn->get_dir(), true);
9399 }
9400 }
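// (During replay the stray dir entries added above are therefore applied
// before the destdn/srcdn updates added further below, matching the
// unlink-then-link ordering described in the comment above.)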
9401
9402 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
9403 CDir *oldin_dir = oldin->get_projected_parent_dir();
9404 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
9405 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
9406 }
9407
9408 // sub off target
9409 if (destdn->is_auth() && !destdnl->is_null()) {
9410 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
9411 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
9412 if (destdnl->is_primary()) {
9413 ceph_assert(straydn);
9414 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
9415 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
9416 }
9417 }
9418
9419 if (srcdnl->is_remote() && srci->is_auth()) {
9420 CDir *srci_dir = srci->get_projected_parent_dir();
9421 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
9422 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
9423 }
9424
9425 // move srcdn
9426 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
9427 int flags = predirty_dir | predirty_primary;
9428 if (srcdn->is_auth())
9429 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
9430 if (destdn->is_auth())
9431 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
9432
9433 // add it all to the metablob
9434 // target inode
9435 if (!linkmerge) {
9436 if (destdnl->is_primary()) {
9437 ceph_assert(straydn);
9438 if (destdn->is_auth()) {
9439 // project snaprealm, too
9440 if (auto& desti_srnode = mdr->more()->desti_srnode) {
9441 oldin->project_snaprealm(desti_srnode);
9442 if (tpi->nlink == 0)
9443 ceph_assert(!desti_srnode->is_parent_global());
9444 desti_srnode = NULL;
9445 }
9446 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9447 metablob->add_primary_dentry(straydn, oldin, true, true);
9448 } else if (force_journal_stray) {
9449 dout(10) << " forced journaling straydn " << *straydn << dendl;
9450 metablob->add_dir_context(straydn->get_dir());
9451 metablob->add_primary_dentry(straydn, oldin, true);
9452 }
9453 } else if (destdnl->is_remote()) {
9454 if (oldin->is_auth()) {
9455 sr_t *new_srnode = NULL;
9456 if (mdr->peer_request) {
9457 if (mdr->peer_request->desti_snapbl.length() > 0) {
9458 new_srnode = new sr_t();
9459 auto p = mdr->peer_request->desti_snapbl.cbegin();
9460 decode(*new_srnode, p);
9461 }
9462 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9463 new_srnode = desti_srnode;
9464 desti_srnode = NULL;
9465 }
9466 if (new_srnode) {
9467 oldin->project_snaprealm(new_srnode);
9468 if (tpi->nlink == 0)
9469 ceph_assert(!new_srnode->is_parent_global());
9470 }
9471 // auth for targeti
9472 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
9473 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
9474 metablob->add_primary_dentry(oldin_pdn, oldin, true);
9475 }
9476 }
9477 }
9478
9479 // dest
9480 if (srcdnl->is_remote()) {
9481 ceph_assert(!linkmerge);
9482 if (destdn->is_auth() && !destdnl->is_null())
9483 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9484 else
9485 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9486
9487 if (destdn->is_auth())
9488 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9489
9490 if (srci->is_auth()) { // it's remote
9491 if (mdr->peer_request) {
9492 if (mdr->peer_request->srci_snapbl.length() > 0) {
9493 sr_t *new_srnode = new sr_t();
9494 auto p = mdr->peer_request->srci_snapbl.cbegin();
9495 decode(*new_srnode, p);
9496 srci->project_snaprealm(new_srnode);
9497 }
9498 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9499 srci->project_snaprealm(srci_srnode);
9500 srci_srnode = NULL;
9501 }
9502
9503 CDentry *srci_pdn = srci->get_projected_parent_dn();
9504 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
9505 metablob->add_primary_dentry(srci_pdn, srci, true);
9506 }
9507 } else if (srcdnl->is_primary()) {
9508 // project snap parent update?
9509 if (destdn->is_auth()) {
9510 if (auto& srci_srnode = mdr->more()->srci_srnode) {
9511 srci->project_snaprealm(srci_srnode);
9512 srci_srnode = NULL;
9513 }
9514 }
9515
9516 if (destdn->is_auth() && !destdnl->is_null())
9517 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9518
9519 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9520 {
9521 auto do_corruption = inject_rename_corrupt_dentry_first;
9522 if (unlikely(do_corruption > 0.0)) {
9523 auto r = ceph::util::generate_random_number(0.0, 1.0);
9524 if (r < do_corruption) {
9525 dout(0) << "corrupting dn: " << *destdn << dendl;
9526 destdn->first = -10;
9527 }
9528 }
9529 }
9530
9531 if (destdn->is_auth())
9532 metablob->add_primary_dentry(destdn, srci, true, true);
9533 else if (force_journal_dest) {
9534 dout(10) << " forced journaling destdn " << *destdn << dendl;
9535 metablob->add_dir_context(destdn->get_dir());
9536 metablob->add_primary_dentry(destdn, srci, true);
9537 if (srcdn->is_auth() && srci->is_dir()) {
9538 // journal new subtrees root dirfrags
9539 auto&& ls = srci->get_dirfrags();
9540 for (const auto& dir : ls) {
9541 if (dir->is_auth())
9542 metablob->add_dir(dir, true);
9543 }
9544 }
9545 }
9546 }
9547
9548 // src
9549 if (srcdn->is_auth()) {
9550 dout(10) << " journaling srcdn " << *srcdn << dendl;
9551 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
9552 // also journal the inode in case we need to do a peer rename rollback. It is OK to add
9553 // both primary and NULL dentries, because during journal replay the null dentry is
9554 // processed after the primary dentry.
9555 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
9556 metablob->add_primary_dentry(srcdn, srci, true);
9557 metablob->add_null_dentry(srcdn, true);
9558 } else
9559 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
9560
9561 // make renamed inode first track the dn
9562 if (srcdnl->is_primary() && destdn->is_auth()) {
9563 ceph_assert(srci->first <= destdn->first);
9564 srci->first = destdn->first;
9565 }
9566 // make stray inode first track the straydn
9567 if (straydn && straydn->is_auth()) {
9568 ceph_assert(oldin->first <= straydn->first);
9569 oldin->first = straydn->first;
9570 }
9571
9572 if (oldin && oldin->is_dir()) {
9573 ceph_assert(straydn);
9574 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
9575 }
9576 if (srci->is_dir())
9577 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
9578
9579 }
9580
9581
9582 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9583 {
9584 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9585 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
9586
9587 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9588 CDentry::linkage_t *destdnl = destdn->get_linkage();
9589
9590 CInode *oldin = destdnl->get_inode();
9591
9592 // primary+remote link merge?
9593 bool linkmerge = (srcdnl->get_inode() == oldin);
9594 if (linkmerge)
9595 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
9596
9597 bool new_in_snaprealm = false;
9598 bool new_oldin_snaprealm = false;
9599
9600 // target inode
9601 if (!linkmerge) {
9602 if (destdnl->is_primary()) {
9603 ceph_assert(straydn);
9604 dout(10) << "straydn is " << *straydn << dendl;
9605
9606 // if there is a newly created snaprealm, we need to split the old snaprealm's
9607 // inodes_with_caps, so pop the snaprealm before the linkage changes.
9608 if (destdn->is_auth()) {
9609 bool hadrealm = (oldin->snaprealm ? true : false);
9610 oldin->early_pop_projected_snaprealm();
9611 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
9612 } else {
9613 ceph_assert(mdr->peer_request);
9614 if (mdr->peer_request->desti_snapbl.length()) {
9615 new_oldin_snaprealm = !oldin->snaprealm;
9616 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9617 ceph_assert(oldin->snaprealm);
9618 }
9619 }
9620
9621 destdn->get_dir()->unlink_inode(destdn, false);
9622
9623 straydn->pop_projected_linkage();
9624 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9625 ceph_assert(!straydn->is_projected()); // no other projected
9626
9627 // nlink-- targeti
9628 if (destdn->is_auth())
9629 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9630
9631 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
9632 } else if (destdnl->is_remote()) {
9633 destdn->get_dir()->unlink_inode(destdn, false);
9634 if (oldin->is_auth()) {
9635 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9636 } else if (mdr->peer_request) {
9637 if (mdr->peer_request->desti_snapbl.length() > 0) {
9638 ceph_assert(oldin->snaprealm);
9639 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9640 }
9641 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9642 delete desti_srnode;
9643 desti_srnode = NULL;
9644 }
9645 }
9646 }
9647
9648 // unlink src before we relink it at dest
9649 CInode *in = srcdnl->get_inode();
9650 ceph_assert(in);
9651
9652 bool srcdn_was_remote = srcdnl->is_remote();
9653 if (!srcdn_was_remote) {
9654 // if there is a newly created snaprealm, we need to split the old snaprealm's
9655 // inodes_with_caps, so pop the snaprealm before the linkage changes.
9656 if (destdn->is_auth()) {
9657 bool hadrealm = (in->snaprealm ? true : false);
9658 in->early_pop_projected_snaprealm();
9659 new_in_snaprealm = (in->snaprealm && !hadrealm);
9660 } else {
9661 ceph_assert(mdr->peer_request);
9662 if (mdr->peer_request->srci_snapbl.length()) {
9663 new_in_snaprealm = !in->snaprealm;
9664 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9665 ceph_assert(in->snaprealm);
9666 }
9667 }
9668 }
9669
9670 srcdn->get_dir()->unlink_inode(srcdn);
9671
9672 // After the stray dn has been unlinked from the corresponding inode (the
9673 // reintegrate_stray/migrate_stray case), wake up the waiters.
9674 MDSContext::vec finished;
9675 in->take_waiting(CInode::WAIT_UNLINK, finished);
9676 if (!finished.empty()) {
9677 mds->queue_waiters(finished);
9678 }
9679
9680 // dest
9681 if (srcdn_was_remote) {
9682 if (!linkmerge) {
9683 // destdn
9684 destdnl = destdn->pop_projected_linkage();
9685 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9686 ceph_assert(!destdn->is_projected()); // no other projected
9687
9688 destdn->link_remote(destdnl, in);
9689 if (destdn->is_auth())
9690 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9691 // in
9692 if (in->is_auth()) {
9693 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9694 } else if (mdr->peer_request) {
9695 if (mdr->peer_request->srci_snapbl.length() > 0) {
9696 ceph_assert(in->snaprealm);
9697 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9698 }
9699 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9700 delete srci_srnode;
9701 srci_srnode = NULL;
9702 }
9703 } else {
9704 dout(10) << "merging remote onto primary link" << dendl;
9705 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9706 }
9707 } else { // primary
9708 if (linkmerge) {
9709 dout(10) << "merging primary onto remote link" << dendl;
9710 destdn->get_dir()->unlink_inode(destdn, false);
9711 }
9712 destdnl = destdn->pop_projected_linkage();
9713 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9714 ceph_assert(!destdn->is_projected()); // no other projected
9715
9716 // srcdn inode import?
9717 if (!srcdn->is_auth() && destdn->is_auth()) {
9718 ceph_assert(mdr->more()->inode_import.length() > 0);
9719
9720 map<client_t,Capability::Import> imported_caps;
9721
9722 // finish cap imports
9723 finish_force_open_sessions(mdr->more()->imported_session_map);
9724 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9725 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
9726 mdr->more()->srcdn_auth_mds, true,
9727 mdr->more()->imported_session_map,
9728 mdr->more()->cap_imports[destdnl->get_inode()],
9729 imported_caps);
9730 }
9731
9732 mdr->more()->inode_import.clear();
9733 encode(imported_caps, mdr->more()->inode_import);
9734
9735 /* hack: add an auth pin for each xlock we hold. These were
9736 * remote xlocks previously but now they're local and
9737 * we're going to try to unpin them when we xlock_finish. */
9738
9739 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9740 i != mdr->locks.end();
9741 ++i) {
9742 SimpleLock *lock = i->lock;
9743 if (lock->get_parent() != destdnl->get_inode())
9744 break;
9745 if (i->is_xlock() && !lock->is_locallock())
9746 mds->locker->xlock_import(lock);
9747 }
9748
9749 // hack: fix auth bit
9750 in->state_set(CInode::STATE_AUTH);
9751
9752 mdr->clear_ambiguous_auth();
9753 }
9754
9755 if (destdn->is_auth())
9756 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9757 }
9758
9759 // src
9760 if (srcdn->is_auth())
9761 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9762 srcdn->pop_projected_linkage();
9763 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9764 ceph_assert(!srcdn->is_projected()); // no other projected
9765
9766 // apply remaining projected inodes (nested)
9767 mdr->apply();
9768
9769 // update subtree map?
9770 if (destdnl->is_primary() && in->is_dir())
9771 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
9772
9773 if (straydn && oldin->is_dir())
9774 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9775
9776 if (new_oldin_snaprealm)
9777 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9778 if (new_in_snaprealm)
9779 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9780
9781 // removing a new dn?
9782 if (srcdn->is_auth())
9783 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9784 }
9785
9786
9787
9788 // ------------
9789 // PEER
9790
9791 class C_MDS_PeerRenamePrep : public ServerLogContext {
9792 CDentry *srcdn, *destdn, *straydn;
9793 public:
9794 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9795 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9796 void finish(int r) override {
9797 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
9798 }
9799 };
9800
9801 class C_MDS_PeerRenameCommit : public ServerContext {
9802 MDRequestRef mdr;
9803 CDentry *srcdn, *destdn, *straydn;
9804 public:
9805 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9806 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9807 void finish(int r) override {
9808 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
9809 }
9810 };
9811
9812 class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
9813 MDRequestRef mdr;
9814 public:
9815 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
9816 ServerContext(s), mdr(r) {}
9817 void finish(int r) override {
9818 server->_peer_rename_sessions_flushed(mdr);
9819 }
9820 };
9821
9822 void Server::handle_peer_rename_prep(MDRequestRef& mdr)
9823 {
9824 dout(10) << "handle_peer_rename_prep " << *mdr
9825 << " " << mdr->peer_request->srcdnpath
9826 << " to " << mdr->peer_request->destdnpath
9827 << dendl;
9828
9829 if (mdr->peer_request->is_interrupted()) {
9830 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9831 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9832 reply->mark_interrupted();
9833 mds->send_message_mds(reply, mdr->peer_to_mds);
9834 mdr->reset_peer_request();
9835 return;
9836 }
9837
9838 // discover destdn
9839 filepath destpath(mdr->peer_request->destdnpath);
9840 dout(10) << " dest " << destpath << dendl;
9841 vector<CDentry*> trace;
9842 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9843 int r = mdcache->path_traverse(mdr, cf, destpath,
9844 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9845 &trace);
9846 if (r > 0) return;
9847 if (r == -CEPHFS_ESTALE) {
9848 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9849 mdr->peer_to_mds, true);
9850 return;
9851 }
9852 ceph_assert(r == 0); // we shouldn't get an error here!
9853
9854 CDentry *destdn = trace.back();
9855 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9856 dout(10) << " destdn " << *destdn << dendl;
9857 mdr->pin(destdn);
9858
9859 // discover srcdn
9860 filepath srcpath(mdr->peer_request->srcdnpath);
9861 dout(10) << " src " << srcpath << dendl;
9862 CInode *srci = nullptr;
9863 r = mdcache->path_traverse(mdr, cf, srcpath,
9864 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9865 &trace, &srci);
9866 if (r > 0) return;
9867 ceph_assert(r == 0);
9868
9869 CDentry *srcdn = trace.back();
9870 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9871 dout(10) << " srcdn " << *srcdn << dendl;
9872 mdr->pin(srcdn);
9873 mdr->pin(srci);
9874
9875 // stray?
9876 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9877 if (linkmerge)
9878 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9879 CDentry *straydn = mdr->straydn;
9880 if (destdnl->is_primary() && !linkmerge)
9881 ceph_assert(straydn);
9882
9883 mdr->set_op_stamp(mdr->peer_request->op_stamp);
9884 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9885
9886 // set up commit waiter (early, to clean up any freezing etc we do)
9887 if (!mdr->more()->peer_commit)
9888 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
9889
9890 // am i srcdn auth?
9891 if (srcdn->is_auth()) {
9892 set<mds_rank_t> srcdnrep;
9893 srcdn->list_replicas(srcdnrep);
9894
9895 bool reply_witness = false;
9896 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9897 // freeze?
9898 // we need this to
9899 // - avoid conflicting lock state changes
9900 // - avoid concurrent updates to the inode
9901 // (this could also be accomplished with the versionlock)
9902 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9903 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9904 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9905
9906 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9907 if (srcdnl->get_inode()->is_frozen_auth_pin())
9908 mdr->unfreeze_auth_pin();
9909
9910 if (!frozen_inode) {
9911 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9912 return;
9913 }
9914
9915 /*
9916 * set ambiguous auth for srci
9917 * NOTE: we don't worry about ambiguous cache expire as we do
9918 * with subtree migrations because all peers will pin
9919 * srcdn->get_inode() for duration of this rename.
9920 */
9921 mdr->set_ambiguous_auth(srcdnl->get_inode());
9922
9923 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9924 // the leader will send another OP_RENAMEPREP peer request later.
9925 if (mdr->peer_request->witnesses.size() > 1) {
9926 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9927 reply_witness = true;
9928 }
9929
9930 // make sure bystanders have received all lock related messages
9931 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9932 if (*p == mdr->peer_to_mds ||
9933 (mds->is_cluster_degraded() &&
9934 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9935 continue;
9936 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
9937 mds->send_message_mds(notify, *p);
9938 mdr->more()->waiting_on_peer.insert(*p);
9939 }
9940
9941 // make sure clients have received all cap related messages
9942 set<client_t> export_client_set;
9943 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9944
9945 MDSGatherBuilder gather(g_ceph_context);
9946 flush_client_sessions(export_client_set, gather);
9947 if (gather.has_subs()) {
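// use MDS_RANK_NONE as a placeholder entry until the client session flush
// completes; it is erased again in _peer_rename_sessions_flushed().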
9948 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9949 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
9950 gather.activate();
9951 }
9952 }
9953
9954 // is witness list sufficient?
9955 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9956 if (*p == mdr->peer_to_mds ||
9957 mdr->peer_request->witnesses.count(*p)) continue;
9958 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9959 reply_witness = true;
9960 break;
9961 }
9962
9963 if (reply_witness) {
9964 ceph_assert(!srcdnrep.empty());
9965 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9966 reply->witnesses.swap(srcdnrep);
9967 mds->send_message_mds(reply, mdr->peer_to_mds);
9968 mdr->reset_peer_request();
9969 return;
9970 }
9971 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9972 if (!mdr->more()->waiting_on_peer.empty()) {
9973 dout(10) << " still waiting for rename notify acks from "
9974 << mdr->more()->waiting_on_peer << dendl;
9975 return;
9976 }
9977 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9978 // set ambiguous auth for srci on witnesses
9979 mdr->set_ambiguous_auth(srcdnl->get_inode());
9980 }
9981
9982 // encode everything we'd need to roll this back... basically, just the original state.
9983 rename_rollback rollback;
9984
9985 rollback.reqid = mdr->reqid;
9986
9987 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9988 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9989 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
9990 rollback.orig_src.dname = srcdn->get_name();
9991 if (srcdnl->is_primary())
9992 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9993 else {
9994 ceph_assert(srcdnl->is_remote());
9995 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9996 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9997 }
9998
9999 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
10000 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
10001 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
10002 rollback.orig_dest.dname = destdn->get_name();
10003 if (destdnl->is_primary())
10004 rollback.orig_dest.ino = destdnl->get_inode()->ino();
10005 else if (destdnl->is_remote()) {
10006 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
10007 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
10008 }
10009
10010 if (straydn) {
10011 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
10012 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
10013 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
10014 rollback.stray.dname = straydn->get_name();
10015 }
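// for each inode whose snaprealm may change, record a bool saying whether it
// already had a realm, followed by the realm's snap blob; do_rename_rollback()
// uses this to either restore the old srnode or merge the realm back into its parent.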
10016 if (mdr->peer_request->desti_snapbl.length()) {
10017 CInode *oldin = destdnl->get_inode();
10018 if (oldin->snaprealm) {
10019 encode(true, rollback.desti_snapbl);
10020 oldin->encode_snap_blob(rollback.desti_snapbl);
10021 } else {
10022 encode(false, rollback.desti_snapbl);
10023 }
10024 }
10025 if (mdr->peer_request->srci_snapbl.length()) {
10026 if (srci->snaprealm) {
10027 encode(true, rollback.srci_snapbl);
10028 srci->encode_snap_blob(rollback.srci_snapbl);
10029 } else {
10030 encode(false, rollback.srci_snapbl);
10031 }
10032 }
10033 encode(rollback, mdr->more()->rollback_bl);
10034 // FIXME: rollback snaprealm
10035 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
10036
10037 // journal.
10038 mdr->ls = mdlog->get_current_segment();
10039 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
10040 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
10041 mdlog->start_entry(le);
10042 le->rollback = mdr->more()->rollback_bl;
10043
10044 bufferlist blah; // inode import data... obviously not used if we're the peer
10045 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
10046
10047 if (le->commit.empty()) {
10048 dout(10) << " empty metablob, skipping journal" << dendl;
10049 mdlog->cancel_entry(le);
10050 mdr->ls = NULL;
10051 _logged_peer_rename(mdr, srcdn, destdn, straydn);
10052 } else {
10053 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
10054 mdr->more()->peer_update_journaled = true;
10055 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
10056 mdr, __func__);
10057 mdlog->flush();
10058 }
10059 }
10060
10061 void Server::_logged_peer_rename(MDRequestRef& mdr,
10062 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
10063 {
10064 dout(10) << "_logged_peer_rename " << *mdr << dendl;
10065
10066 // prepare ack
10067 ref_t<MMDSPeerRequest> reply;
10068 if (!mdr->aborted) {
10069 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
10070 if (!mdr->more()->peer_update_journaled)
10071 reply->mark_not_journaled();
10072 }
10073
10074 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
10075 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
10076
10077 // export srci?
10078 if (srcdn->is_auth() && srcdnl->is_primary()) {
10079 // set export bounds for CInode::encode_export()
10080 if (reply) {
10081 std::vector<CDir*> bounds;
10082 if (srcdnl->get_inode()->is_dir()) {
10083 srcdnl->get_inode()->get_dirfrags(bounds);
10084 for (const auto& bound : bounds) {
10085 bound->state_set(CDir::STATE_EXPORTBOUND);
10086 }
10087 }
10088
10089 map<client_t,entity_inst_t> exported_client_map;
10090 map<client_t, client_metadata_t> exported_client_metadata_map;
10091 bufferlist inodebl;
10092 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
10093 exported_client_map,
10094 exported_client_metadata_map);
10095
10096 for (const auto& bound : bounds) {
10097 bound->state_clear(CDir::STATE_EXPORTBOUND);
10098 }
10099
10100 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
10101 encode(exported_client_metadata_map, reply->inode_export);
10102 reply->inode_export.claim_append(inodebl);
10103 reply->inode_export_v = srcdnl->get_inode()->get_version();
10104 }
10105
10106 // remove mdr auth pin
10107 mdr->auth_unpin(srcdnl->get_inode());
10108 mdr->more()->is_inode_exporter = true;
10109
10110 if (srcdnl->get_inode()->is_dirty())
10111 srcdnl->get_inode()->mark_clean();
10112
10113 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
10114 }
10115
10116 // apply
10117 _rename_apply(mdr, srcdn, destdn, straydn);
10118
10119 CDentry::linkage_t *destdnl = destdn->get_linkage();
10120
10121 // bump popularity
10122 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
10123 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
10124 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
10125
10126 // done.
10127 mdr->reset_peer_request();
10128 mdr->straydn = 0;
10129
10130 if (reply) {
10131 mds->send_message_mds(reply, mdr->peer_to_mds);
10132 } else {
10133 ceph_assert(mdr->aborted);
10134 dout(10) << " abort flag set, finishing" << dendl;
10135 mdcache->request_finish(mdr);
10136 }
10137 }
10138
10139 void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
10140 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
10141 {
10142 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
10143
10144 CInode *in = destdn->get_linkage()->get_inode();
10145
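// note whether this rename pulled an inode out of one of our stray directories;
// if the MDS is stopping we must finish the stray export at the end of this function.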
10146 inodeno_t migrated_stray;
10147 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
10148 migrated_stray = in->ino();
10149
10150 MDSContext::vec finished;
10151 if (r == 0) {
10152 // unfreeze+singleauth inode
10153 // hmm, do i really need to delay this?
10154 if (mdr->more()->is_inode_exporter) {
10155 // drop our pins
10156 // we exported, clear out any xlocks that we moved to another MDS
10157
10158 for (auto i = mdr->locks.lower_bound(&in->versionlock);
10159 i != mdr->locks.end(); ) {
10160 SimpleLock *lock = i->lock;
10161 if (lock->get_parent() != in)
10162 break;
10163 // we only care about xlocks on the exported inode
10164 if (i->is_xlock() && !lock->is_locallock())
10165 mds->locker->xlock_export(i++, mdr.get());
10166 else
10167 ++i;
10168 }
10169
10170 map<client_t,Capability::Import> peer_imported;
10171 auto bp = mdr->more()->inode_import.cbegin();
10172 decode(peer_imported, bp);
10173
10174 dout(10) << " finishing inode export on " << *in << dendl;
10175 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
10176 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
10177
10178 // unfreeze
10179 ceph_assert(in->is_frozen_inode());
10180 in->unfreeze_inode(finished);
10181 }
10182
10183 // singleauth
10184 if (mdr->more()->is_ambiguous_auth) {
10185 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10186 mdr->more()->is_ambiguous_auth = false;
10187 }
10188
10189 if (straydn && mdr->more()->peer_update_journaled) {
10190 CInode *strayin = straydn->get_projected_linkage()->get_inode();
10191 if (strayin && !strayin->snaprealm)
10192 mdcache->clear_dirty_bits_for_stray(strayin);
10193 }
10194
10195 mds->queue_waiters(finished);
10196 mdr->cleanup();
10197
10198 if (mdr->more()->peer_update_journaled) {
10199 // write a commit to the journal
10200 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
10201 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
10202 EPeerUpdate::RENAME);
10203 mdlog->start_entry(le);
10204 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
10205 mdlog->flush();
10206 } else {
10207 _committed_peer(mdr);
10208 }
10209 } else {
10210
10211 // abort
10212 // rollback_bl may be empty if we froze the inode but had to provide an expanded
10213 // witness list from the leader, and they failed before we tried prep again.
10214 if (mdr->more()->rollback_bl.length()) {
10215 if (mdr->more()->is_inode_exporter) {
10216 dout(10) << " reversing inode export of " << *in << dendl;
10217 in->abort_export();
10218 }
10219 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
10220 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
10221 // rollback but preserve the peer request
10222 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
10223 mdr->more()->rollback_bl.clear();
10224 } else
10225 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
10226 } else {
10227 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
10228 // singleauth
10229 if (mdr->more()->is_ambiguous_auth) {
10230 if (srcdn->is_auth())
10231 mdr->more()->rename_inode->unfreeze_inode(finished);
10232
10233 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10234 mdr->more()->is_ambiguous_auth = false;
10235 }
10236 mds->queue_waiters(finished);
10237 mdcache->request_finish(mdr);
10238 }
10239 }
10240
10241 if (migrated_stray && mds->is_stopping())
10242 mdcache->shutdown_export_stray_finish(migrated_stray);
10243 }
10244
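// helper for do_rename_rollback(): re-add the unlinked entry's counts to the
// directory's projected fragstat/rstat, and restore the old mtime/rctime if the
// rename's ctime was the last thing to touch them.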
10245 static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
10246 rename_rollback::drec &r, utime_t ctime,
10247 bool isdir, const nest_info_t &rstat)
10248 {
10249 auto pf = dir->project_fnode(mut);
10250 pf->version = dir->pre_dirty();
10251
10252 if (isdir) {
10253 pf->fragstat.nsubdirs += 1;
10254 } else {
10255 pf->fragstat.nfiles += 1;
10256 }
10257 if (r.ino) {
10258 pf->rstat.rbytes += rstat.rbytes;
10259 pf->rstat.rfiles += rstat.rfiles;
10260 pf->rstat.rsubdirs += rstat.rsubdirs;
10261 pf->rstat.rsnaps += rstat.rsnaps;
10262 }
10263 if (pf->fragstat.mtime == ctime) {
10264 pf->fragstat.mtime = r.dirfrag_old_mtime;
10265 if (pf->rstat.rctime == ctime)
10266 pf->rstat.rctime = r.dirfrag_old_rctime;
10267 }
10268 mut->add_updated_lock(&dir->get_inode()->filelock);
10269 mut->add_updated_lock(&dir->get_inode()->nestlock);
10270 }
10271
10272 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
10273 MutationRef mut;
10274 CDentry *srcdn;
10275 version_t srcdnpv;
10276 CDentry *destdn;
10277 CDentry *straydn;
10278 map<client_t,ref_t<MClientSnap>> splits[2];
10279 bool finish_mdr;
10280 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
10281 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
10282 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
10283 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
10284 straydn(st), finish_mdr(f) {
10285 splits[0].swap(_splits[0]);
10286 splits[1].swap(_splits[1]);
10287 }
10288 void finish(int r) override {
10289 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
10290 destdn, straydn, splits, finish_mdr);
10291 }
10292 };
10293
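// undo a prepared (but uncommitted) peer rename: decode the rename_rollback blob,
// re-link srcdn/destdn/straydn to their original targets, restore ctimes, dir stats
// and snaprealms, then journal an EPeerUpdate OP_ROLLBACK (unless nothing was journaled).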
10294 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
10295 bool finish_mdr)
10296 {
10297 rename_rollback rollback;
10298 auto p = rbl.cbegin();
10299 decode(rollback, p);
10300
10301 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
10302 // need to finish this update before sending resolve to claim the subtree
10303 mdcache->add_rollback(rollback.reqid, leader);
10304
10305 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
10306 mut->ls = mds->mdlog->get_current_segment();
10307
10308 CDentry *srcdn = NULL;
10309 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
10310 if (!srcdir)
10311 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
10312 if (srcdir) {
10313 dout(10) << " srcdir " << *srcdir << dendl;
10314 srcdn = srcdir->lookup(rollback.orig_src.dname);
10315 if (srcdn) {
10316 dout(10) << " srcdn " << *srcdn << dendl;
10317 ceph_assert(srcdn->get_linkage()->is_null());
10318 } else
10319 dout(10) << " srcdn not found" << dendl;
10320 } else
10321 dout(10) << " srcdir not found" << dendl;
10322
10323 CDentry *destdn = NULL;
10324 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
10325 if (!destdir)
10326 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
10327 if (destdir) {
10328 dout(10) << " destdir " << *destdir << dendl;
10329 destdn = destdir->lookup(rollback.orig_dest.dname);
10330 if (destdn)
10331 dout(10) << " destdn " << *destdn << dendl;
10332 else
10333 dout(10) << " destdn not found" << dendl;
10334 } else
10335 dout(10) << " destdir not found" << dendl;
10336
10337 CInode *in = NULL;
10338 if (rollback.orig_src.ino) {
10339 in = mdcache->get_inode(rollback.orig_src.ino);
10340 if (in && in->is_dir())
10341 ceph_assert(srcdn && destdn);
10342 } else
10343 in = mdcache->get_inode(rollback.orig_src.remote_ino);
10344
10345 CDir *straydir = NULL;
10346 CDentry *straydn = NULL;
10347 if (rollback.stray.dirfrag.ino) {
10348 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
10349 if (straydir) {
10350 dout(10) << "straydir " << *straydir << dendl;
10351 straydn = straydir->lookup(rollback.stray.dname);
10352 if (straydn) {
10353 dout(10) << " straydn " << *straydn << dendl;
10354 ceph_assert(straydn->get_linkage()->is_primary());
10355 } else
10356 dout(10) << " straydn not found" << dendl;
10357 } else
10358 dout(10) << "straydir not found" << dendl;
10359 }
10360
10361 CInode *target = NULL;
10362 if (rollback.orig_dest.ino) {
10363 target = mdcache->get_inode(rollback.orig_dest.ino);
10364 if (target)
10365 ceph_assert(destdn && straydn);
10366 } else if (rollback.orig_dest.remote_ino)
10367 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
10368
10369 // can't use is_auth() in the resolve stage
10370 mds_rank_t whoami = mds->get_nodeid();
10371 // peer
10372 ceph_assert(!destdn || destdn->authority().first != whoami);
10373 ceph_assert(!straydn || straydn->authority().first != whoami);
10374
10375 bool force_journal_src = false;
10376 bool force_journal_dest = false;
10377 if (in && in->is_dir() && srcdn->authority().first != whoami)
10378 force_journal_src = _need_force_journal(in, false);
10379 if (in && target && target->is_dir())
10380 force_journal_dest = _need_force_journal(in, true);
10381
10382 version_t srcdnpv = 0;
10383 // repair src
10384 if (srcdn) {
10385 if (srcdn->authority().first == whoami)
10386 srcdnpv = srcdn->pre_dirty();
10387 if (rollback.orig_src.ino) {
10388 ceph_assert(in);
10389 srcdn->push_projected_linkage(in);
10390 } else
10391 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
10392 rollback.orig_src.remote_d_type);
10393 }
10394
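// splits[0] collects realm-merge notifications for the source inode, splits[1] for
// the rename target; both are sent to clients in _rename_rollback_finish() unless
// this MDS is still in the resolve stage.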
10395 map<client_t,ref_t<MClientSnap>> splits[2];
10396
10397 const CInode::mempool_inode *pip = nullptr;
10398 if (in) {
10399 bool projected;
10400 CDir *pdir = in->get_projected_parent_dir();
10401 if (pdir->authority().first == whoami) {
10402 auto pi = in->project_inode(mut);
10403 pi.inode->version = in->pre_dirty();
10404 if (pdir != srcdir) {
10405 auto pf = pdir->project_fnode(mut);
10406 pf->version = pdir->pre_dirty();
10407 }
10408 if (pi.inode->ctime == rollback.ctime)
10409 pi.inode->ctime = rollback.orig_src.old_ctime;
10410 projected = true;
10411 } else {
10412 if (in->get_inode()->ctime == rollback.ctime) {
10413 auto _inode = CInode::allocate_inode(*in->get_inode());
10414 _inode->ctime = rollback.orig_src.old_ctime;
10415 in->reset_inode(_inode);
10416 }
10417 projected = false;
10418 }
10419 pip = in->get_projected_inode().get();
10420
10421 if (rollback.srci_snapbl.length() && in->snaprealm) {
10422 bool hadrealm;
10423 auto p = rollback.srci_snapbl.cbegin();
10424 decode(hadrealm, p);
10425 if (hadrealm) {
10426 if (projected && !mds->is_resolve()) {
10427 sr_t *new_srnode = new sr_t();
10428 decode(*new_srnode, p);
10429 in->project_snaprealm(new_srnode);
10430 } else
10431 decode(in->snaprealm->srnode, p);
10432 } else {
10433 SnapRealm *realm;
10434 if (rollback.orig_src.ino) {
10435 ceph_assert(srcdir);
10436 realm = srcdir->get_inode()->find_snaprealm();
10437 } else {
10438 realm = in->snaprealm->parent;
10439 }
10440 if (!mds->is_resolve())
10441 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
10442 if (projected)
10443 in->project_snaprealm(NULL);
10444 else
10445 in->snaprealm->merge_to(realm);
10446 }
10447 }
10448 }
10449
10450 // repair dest
10451 if (destdn) {
10452 if (rollback.orig_dest.ino && target) {
10453 destdn->push_projected_linkage(target);
10454 } else if (rollback.orig_dest.remote_ino) {
10455 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
10456 rollback.orig_dest.remote_d_type);
10457 } else {
10458 // the dentry will be trimmed soon, so it's ok to have the wrong linkage
10459 if (rollback.orig_dest.ino)
10460 ceph_assert(mds->is_resolve());
10461 destdn->push_projected_linkage();
10462 }
10463 }
10464
10465 if (straydn)
10466 straydn->push_projected_linkage();
10467
10468 if (target) {
10469 bool projected;
10470 CInode::inode_ptr ti;
10471 CDir *pdir = target->get_projected_parent_dir();
10472 if (pdir->authority().first == whoami) {
10473 auto pi = target->project_inode(mut);
10474 pi.inode->version = target->pre_dirty();
10475 if (pdir != srcdir) {
10476 auto pf = pdir->project_fnode(mut);
10477 pf->version = pdir->pre_dirty();
10478 }
10479 ti = pi.inode;
10480 projected = true;
10481 } else {
10482 ti = CInode::allocate_inode(*target->get_inode());
10483 projected = false;
10484 }
10485
10486 if (ti->ctime == rollback.ctime)
10487 ti->ctime = rollback.orig_dest.old_ctime;
10488 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
10489 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
10490 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
10491 else
10492 ceph_assert(rollback.orig_dest.remote_ino &&
10493 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
10494 } else
10495 ti->nlink++;
10496
10497 if (!projected)
10498 target->reset_inode(ti);
10499
10500 if (rollback.desti_snapbl.length() && target->snaprealm) {
10501 bool hadrealm;
10502 auto p = rollback.desti_snapbl.cbegin();
10503 decode(hadrealm, p);
10504 if (hadrealm) {
10505 if (projected && !mds->is_resolve()) {
10506 sr_t *new_srnode = new sr_t();
10507 decode(*new_srnode, p);
10508 target->project_snaprealm(new_srnode);
10509 } else
10510 decode(target->snaprealm->srnode, p);
10511 } else {
10512 SnapRealm *realm;
10513 if (rollback.orig_dest.ino) {
10514 ceph_assert(destdir);
10515 realm = destdir->get_inode()->find_snaprealm();
10516 } else {
10517 realm = target->snaprealm->parent;
10518 }
10519 if (!mds->is_resolve())
10520 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
10521 if (projected)
10522 target->project_snaprealm(NULL);
10523 else
10524 target->snaprealm->merge_to(realm);
10525 }
10526 }
10527 }
10528
10529 if (srcdn && srcdn->authority().first == whoami) {
10530 nest_info_t blah;
10531 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
10532 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
10533 }
10534
10535 if (srcdn)
10536 dout(0) << " srcdn back to " << *srcdn << dendl;
10537 if (in)
10538 dout(0) << " srci back to " << *in << dendl;
10539 if (destdn)
10540 dout(0) << " destdn back to " << *destdn << dendl;
10541 if (target)
10542 dout(0) << " desti back to " << *target << dendl;
10543
10544 // journal it
10545 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
10546 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
10547 mdlog->start_entry(le);
10548
10549 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
10550 le->commit.add_dir_context(srcdir);
10551 if (rollback.orig_src.ino)
10552 le->commit.add_primary_dentry(srcdn, 0, true);
10553 else
10554 le->commit.add_remote_dentry(srcdn, true);
10555 }
10556
10557 if (!rollback.orig_src.ino && // remote linkage
10558 in && in->authority().first == whoami) {
10559 le->commit.add_dir_context(in->get_projected_parent_dir());
10560 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
10561 }
10562
10563 if (force_journal_dest) {
10564 ceph_assert(rollback.orig_dest.ino);
10565 le->commit.add_dir_context(destdir);
10566 le->commit.add_primary_dentry(destdn, 0, true);
10567 }
10568
10569 // peer: no need to journal straydn
10570
10571 if (target && target != in && target->authority().first == whoami) {
10572 ceph_assert(rollback.orig_dest.remote_ino);
10573 le->commit.add_dir_context(target->get_projected_parent_dir());
10574 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
10575 }
10576
10577 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
10578 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
10579 le->commit.renamed_dirino = in->ino();
10580 if (srcdn->authority().first == whoami) {
10581 auto&& ls = in->get_dirfrags();
10582 for (const auto& dir : ls) {
10583 if (!dir->is_auth())
10584 le->commit.renamed_dir_frags.push_back(dir->get_frag());
10585 }
10586 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
10587 }
10588 } else if (force_journal_dest) {
10589 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
10590 le->commit.renamed_dirino = target->ino();
10591 }
10592
10593 if (target && target->is_dir()) {
10594 ceph_assert(destdn);
10595 mdcache->project_subtree_rename(target, straydir, destdir);
10596 }
10597
10598 if (in && in->is_dir()) {
10599 ceph_assert(srcdn);
10600 mdcache->project_subtree_rename(in, destdir, srcdir);
10601 }
10602
10603 if (mdr && !mdr->more()->peer_update_journaled) {
10604 ceph_assert(le->commit.empty());
10605 mdlog->cancel_entry(le);
10606 mut->ls = NULL;
10607 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
10608 } else {
10609 ceph_assert(!le->commit.empty());
10610 if (mdr)
10611 mdr->more()->peer_update_journaled = false;
10612 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
10613 srcdn, srcdnpv, destdn, straydn,
10614 splits, finish_mdr);
10615 submit_mdlog_entry(le, fin, mdr, __func__);
10616 mdlog->flush();
10617 }
10618 }
10619
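// applies the projected rollback state once the EPeerUpdate OP_ROLLBACK entry (if any)
// has been journaled: pop the projected linkages, re-adjust subtree bounds, drop any
// ambiguous-auth/frozen state, and finish or release the peer request.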
10620 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
10621 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
10622 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
10623 {
10624 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
10625
10626 if (straydn) {
10627 straydn->get_dir()->unlink_inode(straydn);
10628 straydn->pop_projected_linkage();
10629 }
10630 if (destdn) {
10631 destdn->get_dir()->unlink_inode(destdn);
10632 destdn->pop_projected_linkage();
10633 }
10634 if (srcdn) {
10635 srcdn->pop_projected_linkage();
10636 if (srcdn->authority().first == mds->get_nodeid()) {
10637 srcdn->mark_dirty(srcdnpv, mut->ls);
10638 if (srcdn->get_linkage()->is_primary())
10639 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
10640 }
10641 }
10642
10643 mut->apply();
10644
10645 if (srcdn && srcdn->get_linkage()->is_primary()) {
10646 CInode *in = srcdn->get_linkage()->get_inode();
10647 if (in && in->is_dir()) {
10648 ceph_assert(destdn);
10649 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10650 }
10651 }
10652
10653 if (destdn) {
10654 CInode *oldin = destdn->get_linkage()->get_inode();
10655 // update subtree map?
10656 if (oldin && oldin->is_dir()) {
10657 ceph_assert(straydn);
10658 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10659 }
10660 }
10661
10662 if (mds->is_resolve()) {
10663 CDir *root = NULL;
10664 if (straydn)
10665 root = mdcache->get_subtree_root(straydn->get_dir());
10666 else if (destdn)
10667 root = mdcache->get_subtree_root(destdn->get_dir());
10668 if (root)
10669 mdcache->try_trim_non_auth_subtree(root);
10670 } else {
10671 mdcache->send_snaps(splits[1]);
10672 mdcache->send_snaps(splits[0]);
10673 }
10674
10675 if (mdr) {
10676 MDSContext::vec finished;
10677 if (mdr->more()->is_ambiguous_auth) {
10678 if (srcdn->is_auth())
10679 mdr->more()->rename_inode->unfreeze_inode(finished);
10680
10681 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10682 mdr->more()->is_ambiguous_auth = false;
10683 }
10684 mds->queue_waiters(finished);
10685 if (finish_mdr || mdr->aborted)
10686 mdcache->request_finish(mdr);
10687 else
10688 mdr->more()->peer_rolling_back = false;
10689 }
10690
10691 mdcache->finish_rollback(mut->reqid, mdr);
10692
10693 mut->cleanup();
10694 }
10695
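// leader-side handler for a witness's OP_RENAMEPREPACK: record the peer as a witness
// (or pick up the extra witness list it suggests), stash any exported srci inode/caps,
// and re-dispatch the client request once all peers have answered.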
10696 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10697 {
10698 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10699 << " witnessed by " << ack->get_source()
10700 << " " << *ack << dendl;
10701 mds_rank_t from = mds_rank_t(ack->get_source().num());
10702
10703 // note peer
10704 mdr->more()->peers.insert(from);
10705 if (mdr->more()->srcdn_auth_mds == from &&
10706 mdr->more()->is_remote_frozen_authpin &&
10707 !mdr->more()->is_ambiguous_auth) {
10708 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10709 }
10710
10711 // witnessed? or add extra witnesses?
10712 ceph_assert(mdr->more()->witnessed.count(from) == 0);
10713 if (ack->is_interrupted()) {
10714 dout(10) << " peer request interrupted, noop" << dendl;
10715 } else if (ack->witnesses.empty()) {
10716 mdr->more()->witnessed.insert(from);
10717 if (!ack->is_not_journaled())
10718 mdr->more()->has_journaled_peers = true;
10719 } else {
10720 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
10721 mdr->more()->extra_witnesses = ack->witnesses;
10722 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10723 }
10724
10725 // srci import?
10726 if (ack->inode_export.length()) {
10727 dout(10) << " got srci import" << dendl;
10728 mdr->more()->inode_import.share(ack->inode_export);
10729 mdr->more()->inode_import_v = ack->inode_export_v;
10730 }
10731
10732 // remove from waiting list
10733 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10734 mdr->more()->waiting_on_peer.erase(from);
10735
10736 if (mdr->more()->waiting_on_peer.empty())
10737 dispatch_client_request(mdr); // go again!
10738 else
10739 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10740 }
10741
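// peer-side handler: a bystander MDS acknowledged our OP_RENAMENOTIFY; once all acks
// are in, resume the pending peer request.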
10742 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10743 {
10744 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
10745 << ack->get_source() << dendl;
10746 ceph_assert(mdr->is_peer());
10747 mds_rank_t from = mds_rank_t(ack->get_source().num());
10748
10749 if (mdr->more()->waiting_on_peer.count(from)) {
10750 mdr->more()->waiting_on_peer.erase(from);
10751
10752 if (mdr->more()->waiting_on_peer.empty()) {
10753 if (mdr->peer_request)
10754 dispatch_peer_request(mdr);
10755 } else
10756 dout(10) << " still waiting for rename notify acks from "
10757 << mdr->more()->waiting_on_peer << dendl;
10758 }
10759 }
10760
10761 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
10762 {
10763 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
10764
10765 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10766 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
10767
10768 if (mdr->more()->waiting_on_peer.empty()) {
10769 if (mdr->peer_request)
10770 dispatch_peer_request(mdr);
10771 } else
10772 dout(10) << " still waiting for rename notify acks from "
10773 << mdr->more()->waiting_on_peer << dendl;
10774 }
10775 }
10776
10777 // snaps
10778 /* This function takes responsibility for the passed mdr*/
10779 void Server::handle_client_lssnap(MDRequestRef& mdr)
10780 {
10781 const cref_t<MClientRequest> &req = mdr->client_request;
10782
10783 // traverse to path
10784 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10785 if (!diri)
10786 return;
10787
10788 if (!diri->is_dir()) {
10789 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10790 return;
10791 }
10792 dout(10) << "lssnap on " << *diri << dendl;
10793
10794 // lock snap
10795 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10796 return;
10797
10798 if (!check_access(mdr, diri, MAY_READ))
10799 return;
10800
10801 SnapRealm *realm = diri->find_snaprealm();
10802 map<snapid_t,const SnapInfo*> infomap;
10803 realm->get_snap_info(infomap, diri->get_oldest_snap());
10804
10805 unsigned max_entries = req->head.args.readdir.max_entries;
10806 if (!max_entries)
10807 max_entries = infomap.size();
10808 int max_bytes = req->head.args.readdir.max_bytes;
10809 if (!max_bytes)
10810 // make sure at least one item can be encoded
10811 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
10812
10813 __u64 last_snapid = 0;
10814 string offset_str = req->get_path2();
10815 if (!offset_str.empty())
10816 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10817
10818 //Empty DirStat
10819 bufferlist dirbl;
10820 static DirStat empty;
10821 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10822
10823 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10824
10825 __u32 num = 0;
10826 bufferlist dnbl;
10827 auto p = infomap.upper_bound(last_snapid);
10828 for (; p != infomap.end() && num < max_entries; ++p) {
10829 dout(10) << p->first << " -> " << *p->second << dendl;
10830
10831 // actual
10832 string snap_name;
10833 if (p->second->ino == diri->ino())
10834 snap_name = p->second->name;
10835 else
10836 snap_name = p->second->get_long_name();
10837
10838 unsigned start_len = dnbl.length();
10839 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10840 break;
10841
10842 encode(snap_name, dnbl);
10843 //infinite lease
10844 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10845 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10846 dout(20) << "encode_infinite_lease" << dendl;
10847
10848 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10849 if (r < 0) {
10850 bufferlist keep;
10851 keep.substr_of(dnbl, 0, start_len);
10852 dnbl.swap(keep);
10853 break;
10854 }
10855 ++num;
10856 }
10857
10858 encode(num, dirbl);
10859 __u16 flags = 0;
10860 if (p == infomap.end()) {
10861 flags = CEPH_READDIR_FRAG_END;
10862 if (last_snapid == 0)
10863 flags |= CEPH_READDIR_FRAG_COMPLETE;
10864 }
10865 encode(flags, dirbl);
10866 dirbl.claim_append(dnbl);
10867
10868 mdr->reply_extra_bl = dirbl;
10869 mdr->tracei = diri;
10870 respond_to_request(mdr, 0);
10871 }
10872
10873
10874 // MKSNAP
10875
10876 struct C_MDS_mksnap_finish : public ServerLogContext {
10877 CInode *diri;
10878 SnapInfo info;
10879 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10880 ServerLogContext(s, r), diri(di), info(i) {}
10881 void finish(int r) override {
10882 server->_mksnap_finish(mdr, diri, info);
10883 }
10884 };
10885
10886 /* This function takes responsibility for the passed mdr*/
10887 void Server::handle_client_mksnap(MDRequestRef& mdr)
10888 {
10889 const cref_t<MClientRequest> &req = mdr->client_request;
10890 // make sure we have as new a map as the client
10891 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10892 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10893 return;
10894 }
10895 if (!mds->mdsmap->allows_snaps()) {
10896 // snapshot creation is disabled until the fs flag allowing new snapshots is set
10897 dout(5) << "new snapshots are disabled for this fs" << dendl;
10898 respond_to_request(mdr, -CEPHFS_EPERM);
10899 return;
10900 }
10901
10902 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10903 if (!diri)
10904 return;
10905
10906 // dir only
10907 if (!diri->is_dir()) {
10908 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10909 return;
10910 }
10911 if (diri->is_system() && !diri->is_root()) {
10912 // no snaps in system dirs (root is ok)
10913 dout(5) << "is an internal system dir" << dendl;
10914 respond_to_request(mdr, -CEPHFS_EPERM);
10915 return;
10916 }
10917
10918 std::string_view snapname = req->get_filepath().last_dentry();
10919
10920 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10921 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10922 respond_to_request(mdr, -CEPHFS_EPERM);
10923 return;
10924 }
10925
10926 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10927
10928 // lock snap
10929 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10930 MutationImpl::LockOpVec lov;
10931 lov.add_xlock(&diri->snaplock);
10932 if (!mds->locker->acquire_locks(mdr, lov))
10933 return;
10934
10935 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10936 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10937 return;
10938 }
10939 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10940 }
10941
10942 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10943 return;
10944
10945 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10946 (subvol_ino && subvol_ino != diri->ino())) {
10947 dout(5) << "is a descendant of a subvolume dir" << dendl;
10948 respond_to_request(mdr, -CEPHFS_EPERM);
10949 return;
10950 }
10951
10952 // check if we can create any more snapshots
10953 // we don't allow any more if we are already at or beyond the limit
10954 if (diri->snaprealm &&
10955 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
10956 respond_to_request(mdr, -CEPHFS_EMLINK);
10957 return;
10958 }
10959
10960 // make sure name is unique
10961 if (diri->snaprealm &&
10962 diri->snaprealm->exists(snapname)) {
10963 respond_to_request(mdr, -CEPHFS_EEXIST);
10964 return;
10965 }
10966 if (snapname.length() == 0 ||
10967 snapname.length() > snapshot_name_max ||
10968 snapname[0] == '_') {
10969 respond_to_request(mdr, -CEPHFS_EINVAL);
10970 return;
10971 }
10972
10973 // allocate a snapid
10974 if (!mdr->more()->stid) {
10975 // prepare an stid
10976 mds->snapclient->prepare_create(diri->ino(), snapname,
10977 mdr->get_mds_stamp(),
10978 &mdr->more()->stid, &mdr->more()->snapidbl,
10979 new C_MDS_RetryRequest(mdcache, mdr));
10980 return;
10981 }
10982
10983 version_t stid = mdr->more()->stid;
10984 snapid_t snapid;
10985 auto p = mdr->more()->snapidbl.cbegin();
10986 decode(snapid, p);
10987 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10988
10989 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10990
10991 SnapPayload payload;
10992 if (req->get_data().length()) {
10993 try {
10994 auto iter = req->get_data().cbegin();
10995 decode(payload, iter);
10996 } catch (const ceph::buffer::error &e) {
10997 // backward compat -- client sends xattr bufferlist. however,
10998 // that is not used anywhere -- so (log and) ignore.
10999 dout(20) << ": no metadata in payload (old client?)" << dendl;
11000 }
11001 }
11002
11003 // journal
11004 SnapInfo info;
11005 info.ino = diri->ino();
11006 info.snapid = snapid;
11007 info.name = snapname;
11008 info.stamp = mdr->get_op_stamp();
11009 info.metadata = payload.metadata;
11010
11011 auto pi = diri->project_inode(mdr, false, true);
11012 pi.inode->ctime = info.stamp;
11013 if (info.stamp > pi.inode->rstat.rctime)
11014 pi.inode->rstat.rctime = info.stamp;
11015 pi.inode->rstat.rsnaps++;
11016 pi.inode->version = diri->pre_dirty();
11017
11018 // project the snaprealm
11019 auto &newsnap = *pi.snapnode;
11020 newsnap.created = snapid;
11021 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
11022 if (!em.second)
11023 em.first->second = info;
11024 newsnap.seq = snapid;
11025 newsnap.last_created = snapid;
11026 newsnap.last_modified = info.stamp;
11027 newsnap.change_attr++;
11028
11029 // journal the inode changes
11030 mdr->ls = mdlog->get_current_segment();
11031 EUpdate *le = new EUpdate(mdlog, "mksnap");
11032 mdlog->start_entry(le);
11033
11034 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11035 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11036 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11037 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11038
11039 // journal the snaprealm changes
11040 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
11041 mdr, __func__);
11042 mdlog->flush();
11043 }
11044
11045 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
11046 {
11047 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
11048
11049 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
11050
11051 mdr->apply();
11052
11053 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11054
11055 // create snap
11056 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11057
11058 // notify other mds
11059 mdcache->send_snap_update(diri, mdr->more()->stid, op);
11060
11061 mdcache->do_realm_invalidate_and_update_notify(diri, op);
11062
11063 // yay
11064 mdr->in[0] = diri;
11065 mdr->snapid = info.snapid;
11066 mdr->tracei = diri;
11067 respond_to_request(mdr, 0);
11068 }
11069
11070
11071 // RMSNAP
11072
11073 struct C_MDS_rmsnap_finish : public ServerLogContext {
11074 CInode *diri;
11075 snapid_t snapid;
11076 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
11077 ServerLogContext(s, r), diri(di), snapid(sn) {}
11078 void finish(int r) override {
11079 server->_rmsnap_finish(mdr, diri, snapid);
11080 }
11081 };
11082
11083 /* This function takes responsibility for the passed mdr*/
11084 void Server::handle_client_rmsnap(MDRequestRef& mdr)
11085 {
11086 const cref_t<MClientRequest> &req = mdr->client_request;
11087
11088 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
11089 if (!diri)
11090 return;
11091
11092 if (!diri->is_dir()) {
11093 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11094 return;
11095 }
11096
11097 std::string_view snapname = req->get_filepath().last_dentry();
11098
11099 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
11100 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
11101 respond_to_request(mdr, -CEPHFS_EPERM);
11102 return;
11103 }
11104
11105 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
11106
11107 // does snap exist?
11108 if (snapname.length() == 0 || snapname[0] == '_') {
11109 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
11110 return;
11111 }
11112 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
11113 respond_to_request(mdr, -CEPHFS_ENOENT);
11114 return;
11115 }
11116 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
11117 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
11118 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11119 MutationImpl::LockOpVec lov;
11120 lov.add_xlock(&diri->snaplock);
11121 if (!mds->locker->acquire_locks(mdr, lov))
11122 return;
11123 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
11124 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
11125 return;
11126 }
11127 mdr->locking_state |= MutationImpl::ALL_LOCKED;
11128 }
11129
11130 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
11131 return;
11132
11133 // prepare
11134 if (!mdr->more()->stid) {
11135 mds->snapclient->prepare_destroy(diri->ino(), snapid,
11136 &mdr->more()->stid, &mdr->more()->snapidbl,
11137 new C_MDS_RetryRequest(mdcache, mdr));
11138 return;
11139 }
11140 version_t stid = mdr->more()->stid;
11141 auto p = mdr->more()->snapidbl.cbegin();
11142 snapid_t seq;
11143 decode(seq, p);
11144 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
11145
11146 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11147
11148 // journal
11149 auto pi = diri->project_inode(mdr, false, true);
11150 pi.inode->version = diri->pre_dirty();
11151 pi.inode->ctime = mdr->get_op_stamp();
11152 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11153 pi.inode->rstat.rctime = mdr->get_op_stamp();
11154 pi.inode->rstat.rsnaps--;
11155
11156 mdr->ls = mdlog->get_current_segment();
11157 EUpdate *le = new EUpdate(mdlog, "rmsnap");
11158 mdlog->start_entry(le);
11159
11160 // project the snaprealm
11161 auto &newnode = *pi.snapnode;
11162 newnode.snaps.erase(snapid);
11163 newnode.seq = seq;
11164 newnode.last_destroyed = seq;
11165 newnode.last_modified = mdr->get_op_stamp();
11166 newnode.change_attr++;
11167
11168 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11169 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11170 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11171 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11172
11173 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
11174 mdr, __func__);
11175 mdlog->flush();
11176 }
11177
11178 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11179 {
11180 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
11181 snapid_t stid = mdr->more()->stid;
11182
11183 mdr->apply();
11184
11185 mds->snapclient->commit(stid, mdr->ls);
11186
11187 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11188
11189 // notify other mds
11190 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
11191
11192 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
11193
11194 // yay
11195 mdr->in[0] = diri;
11196 mdr->tracei = diri;
11197 mdr->snapid = snapid;
11198 respond_to_request(mdr, 0);
11199
11200 // purge snapshot data
11201 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
11202 }
11203
11204 struct C_MDS_renamesnap_finish : public ServerLogContext {
11205 CInode *diri;
11206 snapid_t snapid;
11207 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
11208 ServerLogContext(s, r), diri(di), snapid(sn) {}
11209 void finish(int r) override {
11210 server->_renamesnap_finish(mdr, diri, snapid);
11211 }
11212 };
11213
11214 /* This function takes responsibility for the passed mdr*/
11215 void Server::handle_client_renamesnap(MDRequestRef& mdr)
11216 {
11217 const cref_t<MClientRequest> &req = mdr->client_request;
11218 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
11219 respond_to_request(mdr, -CEPHFS_EINVAL);
11220 return;
11221 }
11222
11223 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
11224 if (!diri)
11225 return;
11226
11227 if (!diri->is_dir()) { // dir only
11228 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11229 return;
11230 }
11231
11232 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
11233 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
11234 respond_to_request(mdr, -CEPHFS_EPERM);
11235 return;
11236 }
11237
11238 std::string_view dstname = req->get_filepath().last_dentry();
11239 std::string_view srcname = req->get_filepath2().last_dentry();
11240 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
11241
11242 if (srcname.length() == 0 || srcname[0] == '_') {
11243 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
11244 return;
11245 }
11246 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
11247 respond_to_request(mdr, -CEPHFS_ENOENT);
11248 return;
11249 }
11250 if (dstname.length() == 0 || dstname[0] == '_') {
11251 respond_to_request(mdr, -CEPHFS_EINVAL);
11252 return;
11253 }
11254 if (diri->snaprealm->exists(dstname)) {
11255 respond_to_request(mdr, -CEPHFS_EEXIST);
11256 return;
11257 }
11258
11259 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
11260 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
11261
11262 // lock snap
11263 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11264 MutationImpl::LockOpVec lov;
11265 lov.add_xlock(&diri->snaplock);
11266 if (!mds->locker->acquire_locks(mdr, lov))
11267 return;
11268 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
11269 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
11270 return;
11271 }
11272 mdr->locking_state |= MutationImpl::ALL_LOCKED;
11273 }
11274
11275 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
11276 return;
11277
11278 // prepare
11279 if (!mdr->more()->stid) {
11280 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
11281 &mdr->more()->stid,
11282 new C_MDS_RetryRequest(mdcache, mdr));
11283 return;
11284 }
11285
11286 version_t stid = mdr->more()->stid;
11287 dout(10) << " stid is " << stid << dendl;
11288
11289 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11290
11291 // journal
11292 auto pi = diri->project_inode(mdr, false, true);
11293 pi.inode->ctime = mdr->get_op_stamp();
11294 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11295 pi.inode->rstat.rctime = mdr->get_op_stamp();
11296 pi.inode->version = diri->pre_dirty();
11297
11298 // project the snaprealm
11299 auto &newsnap = *pi.snapnode;
11300 auto it = newsnap.snaps.find(snapid);
11301 ceph_assert(it != newsnap.snaps.end());
11302 it->second.name = dstname;
11303 newsnap.last_modified = mdr->get_op_stamp();
11304 newsnap.change_attr++;
11305
11306 // journal the inode changes
11307 mdr->ls = mdlog->get_current_segment();
11308 EUpdate *le = new EUpdate(mdlog, "renamesnap");
11309 mdlog->start_entry(le);
11310
11311 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11312 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11313 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11314 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11315
11316 // journal the snaprealm changes
11317 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
11318 mdr, __func__);
11319 mdlog->flush();
11320 }
11321
11322 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11323 {
11324 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
11325
11326 mdr->apply();
11327
11328 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11329
11330 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11331
11332 // notify other mds
11333 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
11334
11335 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
11336
11337 // yay
11338 mdr->in[0] = diri;
11339 mdr->tracei = diri;
11340 mdr->snapid = snapid;
11341 respond_to_request(mdr, 0);
11342 }
11343
11344 /**
11345 * Return true if server is in state RECONNECT and this
11346 * client has not yet reconnected.
11347 */
11348 bool Server::waiting_for_reconnect(client_t c) const
11349 {
11350 return client_reconnect_gather.count(c) > 0;
11351 }
11352
11353 void Server::dump_reconnect_status(Formatter *f) const
11354 {
11355 f->open_object_section("reconnect_status");
11356 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
11357 f->close_section();
11358 }
11359
11360 const bufferlist& Server::get_snap_trace(Session *session, SnapRealm *realm) const {
11361 ceph_assert(session);
11362 ceph_assert(realm);
11363 if (session->info.has_feature(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) {
11364 return realm->get_snap_trace_new();
11365 } else {
11366 return realm->get_snap_trace();
11367 }
11368 }
11369
11370 const bufferlist& Server::get_snap_trace(client_t client, SnapRealm *realm) const {
11371 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
11372 return get_snap_trace(session, realm);
11373 }