ceph/src/mds/Server.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
34
35 #include "msg/Messenger.h"
36
37 #include "osdc/Objecter.h"
38
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
45
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
53 #include "fscrypt.h"
54
55 #include <errno.h>
56
57 #include <list>
58 #include <regex>
59 #include <string_view>
60 #include <functional>
61
62 #include "common/config.h"
63
64 #include "msg/Message.h"
65
66 #define dout_context g_ceph_context
67 #define dout_subsys ceph_subsys_mds
68 #undef dout_prefix
69 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
70
71 using namespace std;
72
73 class ServerContext : public MDSContext {
74 protected:
75 Server *server;
76 MDSRank *get_mds() override
77 {
78 return server->mds;
79 }
80
81 public:
82 explicit ServerContext(Server *s) : server(s) {
83 ceph_assert(server != NULL);
84 }
85 };
86
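// Batches concurrent lookup/getattr requests that target the same dentry or
// inode: the front request (mdr) drives the operation, find_new_head() promotes
// the next live request if the front is killed, and _respond()/_forward() reply
// to or forward every queued request together, sharing the front request's trace.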
87 class Batch_Getattr_Lookup : public BatchOp {
88 protected:
89 Server* server;
90 ceph::ref_t<MDRequestImpl> mdr;
91 std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
92 int res = 0;
93 public:
94 Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
95 : server(s), mdr(r) {
96 if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
97 mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
98 else
99 mdr->batch_op_map = &mdr->in[0]->batch_ops;
100 }
101 void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
102 batch_reqs.push_back(r);
103 }
104 ceph::ref_t<MDRequestImpl> find_new_head() override {
105 while (!batch_reqs.empty()) {
106 auto r = std::move(batch_reqs.back());
107 batch_reqs.pop_back();
108 if (r->killed)
109 continue;
110
111 r->batch_op_map = mdr->batch_op_map;
112 mdr->batch_op_map = nullptr;
113 mdr = r;
114 return mdr;
115 }
116 return nullptr;
117 }
118 void _forward(mds_rank_t t) override {
119 MDCache* mdcache = server->mdcache;
120 mdcache->mds->forward_message_mds(mdr, t);
121 mdr->set_mds_stamp(ceph_clock_now());
122 for (auto& m : batch_reqs) {
123 if (!m->killed)
124 mdcache->request_forward(m, t);
125 }
126 batch_reqs.clear();
127 }
128 void _respond(int r) override {
129 mdr->set_mds_stamp(ceph_clock_now());
130 for (auto& m : batch_reqs) {
131 if (!m->killed) {
132 m->tracei = mdr->tracei;
133 m->tracedn = mdr->tracedn;
134 server->respond_to_request(m, r);
135 }
136 }
137 batch_reqs.clear();
138 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
139 }
140 void print(std::ostream& o) const override {
141 o << "[batch front=" << *mdr << "]";
142 }
143 };
144
145 class ServerLogContext : public MDSLogContextBase {
146 protected:
147 Server *server;
148 MDSRank *get_mds() override
149 {
150 return server->mds;
151 }
152
153 MDRequestRef mdr;
154 void pre_finish(int r) override {
155 if (mdr)
156 mdr->mark_event("journal_committed: ");
157 }
158 public:
159 explicit ServerLogContext(Server *s) : server(s) {
160 ceph_assert(server != NULL);
161 }
162 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
163 ceph_assert(server != NULL);
164 }
165 };
166
167 void Server::create_logger()
168 {
169 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
170
171 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
172 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
173 plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
174 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
175 plb.add_u64_counter(l_mdss_handle_client_session,
176 "handle_client_session", "Client session messages", "hcs",
177 PerfCountersBuilder::PRIO_INTERESTING);
178 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
179 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
180 plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
181 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
182 PerfCountersBuilder::PRIO_INTERESTING);
183
184 // fop latencies are useful
185 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
186 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
187 "Request type lookup hash of inode latency");
188 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
189 "Request type lookup inode latency");
190 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
191 "Request type lookup parent latency");
192 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
193 "Request type lookup name latency");
194 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
195 "Request type lookup latency");
196 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
197 "Request type lookup snapshot latency");
198 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
199 "Request type get attribute latency");
200 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
201 "Request type set attribute latency");
202 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
203 "Request type set file layout latency");
204 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
205 "Request type set directory layout latency");
206 plb.add_time_avg(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
207 "Request type get virtual extended attribute latency");
208 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
209 "Request type set extended attribute latency");
210 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
211 "Request type remove extended attribute latency");
212 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
213 "Request type read directory latency");
214 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
215 "Request type set file lock latency");
216 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
217 "Request type get file lock latency");
218 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
219 "Request type create latency");
220 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
221 "Request type open latency");
222 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
223 "Request type make node latency");
224 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
225 "Request type link latency");
226 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
227 "Request type unlink latency");
228 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
229 "Request type remove directory latency");
230 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
231 "Request type rename latency");
232 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
233 "Request type make directory latency");
234 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
235 "Request type symbolic link latency");
236 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
237 "Request type list snapshot latency");
238 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
239 "Request type make snapshot latency");
240 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
241 "Request type remove snapshot latency");
242 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
243 "Request type rename snapshot latency");
244 plb.add_time_avg(l_mdss_req_snapdiff_latency, "req_snapdiff_latency",
245 "Request type snapshot difference latency");
246
247 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
248 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
249 "Client requests dispatched");
250 plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
251 "Server requests dispatched");
252
253 logger = plb.create_perf_counters();
254 g_ceph_context->get_perfcounters_collection()->add(logger);
255 }
256
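// The constructor caches the tunables it needs from the config; most of these
// values are refreshed at runtime by Server::handle_conf_change() below.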
257 Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
258 mds(m),
259 mdcache(mds->mdcache), mdlog(mds->mdlog),
260 inject_rename_corrupt_dentry_first(g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first")),
261 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
262 metrics_handler(metrics_handler)
263 {
264 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
265 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
266 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
267 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
268 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
269 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
270 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
271 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
272 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
273 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
274 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
275 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
276 supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
277 }
278
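// Entry point for all messages routed to the Server. Reconnect messages are
// handled unconditionally; while the rank is not yet active, client requests are
// either queued for replay, dropped (closed sessions), or deferred until the MDS
// becomes active. Everything else is dispatched by message type below.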
279 void Server::dispatch(const cref_t<Message> &m)
280 {
281 switch (m->get_type()) {
282 case CEPH_MSG_CLIENT_RECONNECT:
283 handle_client_reconnect(ref_cast<MClientReconnect>(m));
284 return;
285 }
286
287 /*
288  * In the reconnect phase, clients may have sent unsafe requests to the MDS before the
289  * reconnect message. Setting sessionclosed_isok handles the following scenario:
290  * 1. In the reconnect phase, a client sent unsafe requests to the MDS.
291  * 2. The reconnect timeout was reached. Sessions that did not send a reconnect msg in time,
292  *    some of which may have sent unsafe requests, are marked closed. (Another situation is
293  *    #31668, which denies all client reconnect msgs to speed up reboot.)
294  * 3. The unsafe requests from those closed or denied sessions can then be handled in the clientreplay phase.
295  */
296 bool sessionclosed_isok = replay_unsafe_with_closed_session;
297 // active?
298 // handle_peer_request()/handle_client_session() will wait if necessary
299 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
300 const auto &req = ref_cast<MClientRequest>(m);
301 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
302 Session *session = mds->get_session(req);
303 if (!session || (!session->is_open() && !sessionclosed_isok)) {
304 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
305 return;
306 }
307 bool queue_replay = false;
308 if (req->is_replay() || req->is_async()) {
309 dout(3) << "queuing replayed op" << dendl;
310 queue_replay = true;
311 if (req->head.ino &&
312 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
313 inodeno_t ino(req->head.ino);
314 mdcache->add_replay_ino_alloc(ino);
315 if (replay_unsafe_with_closed_session &&
316 session->free_prealloc_inos.contains(ino)) {
317 // don't purge inodes that will be created by later replay
318 session->free_prealloc_inos.erase(ino);
319 session->delegated_inos.insert(ino);
320 }
321 }
322 } else if (req->get_retry_attempt()) {
323 // Process completed requests in the clientreplay stage. A completed request
324 // might have created a new file/directory. This guarantees the MDS sends a reply
325 // to the client before another request modifies the new file/directory.
326 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
327 dout(3) << "queuing completed op" << dendl;
328 queue_replay = true;
329 }
330 // this request was created before the cap reconnect message, drop any embedded
331 // cap releases.
332 req->releases.clear();
333 }
334 if (queue_replay) {
335 req->mark_queued_for_replay();
336 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
337 return;
338 }
339 }
340
341 bool wait_for_active = true;
342 if (mds->is_stopping()) {
343 wait_for_active = false;
344 } else if (mds->is_clientreplay()) {
345 if (req->is_queued_for_replay()) {
346 wait_for_active = false;
347 }
348 }
349 if (wait_for_active) {
350 dout(3) << "not active yet, waiting" << dendl;
351 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
352 return;
353 }
354 }
355
356 switch (m->get_type()) {
357 case CEPH_MSG_CLIENT_SESSION:
358 handle_client_session(ref_cast<MClientSession>(m));
359 return;
360 case CEPH_MSG_CLIENT_REQUEST:
361 handle_client_request(ref_cast<MClientRequest>(m));
362 return;
363 case CEPH_MSG_CLIENT_RECLAIM:
364 handle_client_reclaim(ref_cast<MClientReclaim>(m));
365 return;
366 case MSG_MDS_PEER_REQUEST:
367 handle_peer_request(ref_cast<MMDSPeerRequest>(m));
368 return;
369 default:
370 derr << "Server unknown message " << m->get_type() << " from peer type " << m->get_connection()->get_peer_type() << dendl;
371 ceph_abort_msg("server unknown message " + to_string(m->get_type()) + " from peer type " + to_string(m->get_connection()->get_peer_type()));
372 }
373 }
374
375
376
377 // ----------------------------------------------------------
378 // SESSION management
379
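// Journal completion context for session state changes: once the ESession entry
// is safe, it calls _session_logged() to apply the open/close, including freeing
// and purging any preallocated inos recorded for a closing session.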
380 class C_MDS_session_finish : public ServerLogContext {
381 Session *session;
382 uint64_t state_seq;
383 bool open;
384 version_t cmapv;
385 interval_set<inodeno_t> inos_to_free;
386 version_t inotablev;
387 interval_set<inodeno_t> inos_to_purge;
388 LogSegment *ls = nullptr;
389 Context *fin;
390 public:
391 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
392 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
393 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
394 const interval_set<inodeno_t>& to_free, version_t iv,
395 const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
396 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
397 inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
398 void finish(int r) override {
399 ceph_assert(r == 0);
400 server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
401 if (fin) {
402 fin->complete(r);
403 }
404 }
405 };
406
407 Session* Server::find_session_by_uuid(std::string_view uuid)
408 {
409 Session* session = nullptr;
410 for (auto& it : mds->sessionmap.get_sessions()) {
411 auto& metadata = it.second->info.client_metadata;
412
413 auto p = metadata.find("uuid");
414 if (p == metadata.end() || p->second != uuid)
415 continue;
416
417 if (!session) {
418 session = it.second;
419 } else if (!session->reclaiming_from) {
420 ceph_assert(it.second->reclaiming_from == session);
421 session = it.second;
422 } else {
423 ceph_assert(session->reclaiming_from == it.second);
424 }
425 }
426 return session;
427 }
428
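// A client reclaims an earlier session by presenting its uuid. If a session with
// that uuid exists and belongs to the same auth_name, it is remembered as
// reclaiming_from and then handled in finish_reclaim_session(), which kills the
// old session if it is already blocklisted (or blocklisting on evict is disabled)
// and evicts it otherwise. Only CEPH_RECLAIM_RESET is supported so far.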
429 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
430 {
431 if (!session->is_open() && !session->is_stale()) {
432 dout(10) << "session not open, dropping this req" << dendl;
433 return;
434 }
435
436 auto reply = make_message<MClientReclaimReply>(0);
437 if (m->get_uuid().empty()) {
438 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
439 reply->set_result(-CEPHFS_EINVAL);
440 mds->send_message_client(reply, session);
441 return;
442 }
443
444 unsigned flags = m->get_flags();
445 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
446 dout(10) << __func__ << " unsupported flags" << dendl;
447 reply->set_result(-CEPHFS_EINVAL);
448 mds->send_message_client(reply, session);
449 return;
450 }
451
452 Session* target = find_session_by_uuid(m->get_uuid());
453 if (target) {
454 if (session->info.auth_name != target->info.auth_name) {
455 dout(10) << __func__ << " session auth_name " << session->info.auth_name
456 << " != target auth_name " << target->info.auth_name << dendl;
457 reply->set_result(-CEPHFS_EPERM);
458 mds->send_message_client(reply, session);
459 }
460
461 ceph_assert(!target->reclaiming_from);
462 ceph_assert(!session->reclaiming_from);
463 session->reclaiming_from = target;
464 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
465 }
466
467 if (flags & CEPH_RECLAIM_RESET) {
468 finish_reclaim_session(session, reply);
469 } else ceph_assert(0); /* no other flags are handled at this time */
470 }
471
472 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
473 {
474 Session *target = session->reclaiming_from;
475 if (target) {
476 session->reclaiming_from = nullptr;
477
478 Context *send_reply;
479 if (reply) {
480 int64_t session_id = session->get_client().v;
481 send_reply = new LambdaContext([this, session_id, reply](int r) {
482 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
483 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
484 if (!session) {
485 return;
486 }
487 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
488 reply->set_epoch(epoch);
489 mds->send_message_client(reply, session);
490 });
491 } else {
492 send_reply = nullptr;
493 }
494
495 bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
496 return map.is_blocklisted(target->info.inst.addr);
497 });
498
499 if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
500 kill_session(target, send_reply);
501 } else {
502 CachedStackStringStream css;
503 mds->evict_client(target->get_client().v, false, true, *css, send_reply);
504 }
505 } else if (reply) {
506 mds->send_message_client(reply, session);
507 }
508 }
509
510 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
511 {
512 Session *session = mds->get_session(m);
513 uint32_t flags = m->get_flags();
514 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
515 ceph_assert(m->is_a_client()); // should _not_ come from an mds!
516
517 if (!session) {
518 dout(0) << " ignoring sessionless msg " << *m << dendl;
519 return;
520 }
521
522 std::string_view fs_name = mds->mdsmap->get_fs_name();
523 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
524 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
525 return;
526 }
527
528 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
529 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
530 return;
531 }
532
533 if (flags & MClientReclaim::FLAG_FINISH) {
534 if (flags ^ MClientReclaim::FLAG_FINISH) {
535 dout(0) << __func__ << " client specified FLAG_FINISH with other flags."
536 " Other flags:" << flags << dendl;
537 auto reply = make_message<MClientReclaimReply>(0);
538 reply->set_result(-CEPHFS_EINVAL);
539 mds->send_message_client(reply, session);
540 return;
541 }
542 finish_reclaim_session(session);
543 } else {
544 reclaim_session(session, m);
545 }
546 }
547
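// Client session state machine. REQUEST_OPEN validates the session (blocklist,
// required features, claimed root path, uuid uniqueness) and journals an ESession
// before replying with CEPH_SESSION_OPEN; RENEWCAPS, CLOSE, FLUSHMSG_ACK and
// REQUEST_FLUSH_MDLOG are handled inline; unknown ops are rejected and the client
// is evicted.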
548 void Server::handle_client_session(const cref_t<MClientSession> &m)
549 {
550 version_t pv;
551 Session *session = mds->get_session(m);
552
553 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
554 ceph_assert(m->is_a_client()); // should _not_ come from an mds!
555
556 if (!session) {
557 dout(0) << " ignoring sessionless msg " << *m << dendl;
558 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
559 reply->metadata["error_string"] = "sessionless";
560 mds->send_message(reply, m->get_connection());
561 return;
562 }
563
564 std::string_view fs_name = mds->mdsmap->get_fs_name();
565 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
566 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
567 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
568 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
569 std::string(fs_name) + "\"";
570 mds->send_message(std::move(reply), m->get_connection());
571 return;
572 }
573
574 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
575 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
576 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
577 // close requests need to be handled when mds is active
578 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
579 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
580 return;
581 }
582 } else {
583 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
584 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
585 return;
586 }
587 }
588
589 if (logger)
590 logger->inc(l_mdss_handle_client_session);
591
592 uint64_t sseq = 0;
593 switch (m->get_op()) {
594 case CEPH_SESSION_REQUEST_OPEN:
595 if(mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
596 dout(0) << "new sessions are not permitted, enable again via"
597 " `ceph fs set <fs_name> refuse_client_session false`" << dendl;
598 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
599 reply->metadata["error_string"] = "new sessions are not permitted,"
600 " enable again via `ceph fs set"
601 " <fs_name> refuse_client_session false`";
602 mds->send_message(reply, m->get_connection());
603 return;
604 }
605 if (session->is_opening() ||
606 session->is_open() ||
607 session->is_stale() ||
608 session->is_killing() ||
609 terminating_sessions) {
610 if (m->supported_features.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE)) {
611 if (session->is_open() && !mds->is_stopping()) {
612 dout(10) << "currently already opened" << dendl;
613
614 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN,
615 session->get_push_seq());
616 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
617 reply->supported_features = supported_features;
618 mds->send_message_client(reply, session);
619 if (mdcache->is_readonly()) {
620 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
621 mds->send_message_client(m, session);
622 }
623 }
624 }
625 dout(10) << "currently " << session->get_state_name()
626 << ", dropping this req" << dendl;
627 return;
628 }
629 ceph_assert(session->is_closed() || session->is_closing());
630
631 if (mds->is_stopping()) {
632 dout(10) << "mds is stopping, dropping open req" << dendl;
633 return;
634 }
635
636 {
637 auto& addr = session->info.inst.addr;
638 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
639 auto& client_metadata = session->info.client_metadata;
640
641 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
642 auto now = ceph_clock_now();
643 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
644 auto elapsed = now - m->get_recv_stamp();
645 CachedStackStringStream css;
646 *css << "New client session:"
647 << " addr=\"" << session->info.inst.addr << "\""
648 << ",elapsed=" << elapsed
649 << ",throttled=" << throttle_elapsed
650 << ",status=\"" << status << "\"";
651 if (!err.empty()) {
652 *css << ",error=\"" << err << "\"";
653 }
654 const auto& metadata = session->info.client_metadata;
655 if (auto it = metadata.find("root"); it != metadata.end()) {
656 *css << ",root=\"" << it->second << "\"";
657 }
658 dout(2) << css->strv() << dendl;
659 };
660
661 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
662 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
663 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
664 m->metadata["error_string"] = err_str;
665 mds->send_message_client(m, session);
666 log_session_status("REJECTED", err_str);
667 };
668
669 bool blocklisted = mds->objecter->with_osdmap(
670 [&addr](const OSDMap &osd_map) -> bool {
671 return osd_map.is_blocklisted(addr);
672 });
673
674 if (blocklisted) {
675 dout(10) << "rejecting blocklisted client " << addr << dendl;
676 // This goes on the wire and the "blacklisted" substring is
677 // depended upon by the kernel client for detecting whether it
678 // has been blocklisted. If mounted with recover_session=clean
679 // (since 5.4), it tries to automatically recover itself from
680 // blocklisting.
681 unsigned flags = 0;
682 flags |= MClientSession::SESSION_BLOCKLISTED;
683 send_reject_message("blocklisted (blacklisted)", flags);
684 session->clear();
685 break;
686 }
687
688 if (client_metadata.features.empty())
689 infer_supported_features(session, client_metadata);
690
691 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
692 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
693 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
694 for (const auto& p : client_metadata) {
695 dout(20) << " " << p.first << ": " << p.second << dendl;
696 }
697
698 feature_bitset_t missing_features = required_client_features;
699 missing_features -= client_metadata.features;
700 if (!missing_features.empty()) {
701 CachedStackStringStream css;
702 *css << "missing required features '" << missing_features << "'";
703 send_reject_message(css->strv());
704 mds->clog->warn() << "client session (" << session->info.inst
705 << ") lacks required features " << missing_features
706 << "; client supports " << client_metadata.features;
707 session->clear();
708 break;
709 }
710
711 // Special case for the 'root' metadata path; validate that the claimed
712 // root is actually within the caps of the session
713 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
714 auto claimed_root = it->second;
715 CachedStackStringStream css;
716 bool denied = false;
717 // claimed_root has a leading "/" which we strip before passing
718 // into caps check
719 if (claimed_root.empty() || claimed_root[0] != '/') {
720 denied = true;
721 *css << "invalid root '" << claimed_root << "'";
722 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
723 denied = true;
724 *css << "non-allowable root '" << claimed_root << "'";
725 }
726
727 if (denied) {
728 // Tell the client we're rejecting their open
729 send_reject_message(css->strv());
730 mds->clog->warn() << "client session with " << css->strv()
731 << " denied (" << session->info.inst << ")";
732 session->clear();
733 break;
734 }
735 }
736
737 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
738 if (find_session_by_uuid(it->second)) {
739 send_reject_message("duplicated session uuid");
740 mds->clog->warn() << "client session with duplicated session uuid '"
741 << it->second << "' denied (" << session->info.inst << ")";
742 session->clear();
743 break;
744 }
745 }
746
747 if (session->is_closed()) {
748 mds->sessionmap.add_session(session);
749 }
750
751 pv = mds->sessionmap.mark_projected(session);
752 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
753 mds->sessionmap.touch_session(session);
754 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
755 ceph_assert(r == 0);
756 log_session_status("ACCEPTED", "");
757 });
758 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
759 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
760 mdlog->flush();
761 }
762 break;
763
764 case CEPH_SESSION_REQUEST_RENEWCAPS:
765 if (session->is_open() || session->is_stale()) {
766 mds->sessionmap.touch_session(session);
767 if (session->is_stale()) {
768 mds->sessionmap.set_state(session, Session::STATE_OPEN);
769 mds->locker->resume_stale_caps(session);
770 mds->sessionmap.touch_session(session);
771 }
772 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
773 mds->send_message_client(reply, session);
774 } else {
775 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
776 }
777 break;
778
779 case CEPH_SESSION_REQUEST_CLOSE:
780 {
781 if (session->is_closed() ||
782 session->is_closing() ||
783 session->is_killing()) {
784 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
785 return;
786 }
787 if (session->is_importing()) {
788 dout(10) << "ignoring close req on importing session" << dendl;
789 return;
790 }
791 ceph_assert(session->is_open() ||
792 session->is_stale() ||
793 session->is_opening());
794 if (m->get_seq() < session->get_push_seq()) {
795 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
796 << ", dropping" << dendl;
797 return;
798 }
799 // We are getting a seq that is higher than expected.
800 // Handle the same as any other sequence-number error.
801 //
802 if (m->get_seq() != session->get_push_seq()) {
803 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
804 << ", BUGGY!" << dendl;
805 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
806 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
807 return;
808 }
809 journal_close_session(session, Session::STATE_CLOSING, NULL);
810 }
811 break;
812
813 case CEPH_SESSION_FLUSHMSG_ACK:
814 finish_flush_session(session, m->get_seq());
815 break;
816
817 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
818 if (mds->is_active())
819 mdlog->flush();
820 break;
821
822 default:
823 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
824 mds->send_message_client(m, session);
825 derr << "Server received unknown message " << m->get_type() << ", closing session and blocklisting the client " << session->get_client() << dendl;
826 CachedStackStringStream css;
827 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
828 }
829 }
830
831 void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
832 if (!session->is_open() ||
833 !session->get_connection() ||
834 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
835 return;
836 }
837
838 version_t seq = session->wait_for_flush(gather.new_sub());
839 mds->send_message_client(
840 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
841 }
842
843 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
844 {
845 for (const auto& client : client_set) {
846 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
847 ceph_assert(session);
848 flush_session(session, gather);
849 }
850 }
851
852 void Server::finish_flush_session(Session *session, version_t seq)
853 {
854 MDSContext::vec finished;
855 session->finish_flush(seq, finished);
856 mds->queue_waiters(finished);
857 }
858
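// Called once the ESession entry for a session open/close is safely journaled.
// For an open, the session moves to STATE_OPEN and the client gets its OPEN reply;
// for a close/kill, remaining caps and leases are torn down, the reconnect/reclaim
// gather sets are updated, and the session is removed from the session map.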
859 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
860 const interval_set<inodeno_t>& inos_to_free, version_t piv,
861 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
862 {
863 dout(10) << "_session_logged " << session->info.inst
864 << " state_seq " << state_seq
865 << " " << (open ? "open":"close") << " " << pv
866 << " inos_to_free " << inos_to_free << " inotablev " << piv
867 << " inos_to_purge " << inos_to_purge << dendl;
868
869 if (!open) {
870 if (inos_to_purge.size()){
871 ceph_assert(ls);
872 session->info.prealloc_inos.subtract(inos_to_purge);
873 ls->purging_inodes.insert(inos_to_purge);
874 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
875 mdcache->purge_inodes(inos_to_purge, ls);
876 }
877
878 if (inos_to_free.size()) {
879 ceph_assert(piv);
880 ceph_assert(session->is_closing() || session->is_killing() ||
881 session->is_opening()); // re-open closing session
882 session->info.prealloc_inos.subtract(inos_to_free);
883 mds->inotable->apply_release_ids(inos_to_free);
884 ceph_assert(mds->inotable->get_version() == piv);
885 }
886 session->free_prealloc_inos = session->info.prealloc_inos;
887 session->delegated_inos.clear();
888 }
889
890 mds->sessionmap.mark_dirty(session);
891
892 // apply
893 if (session->get_state_seq() != state_seq) {
894 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
895 << ", noop" << dendl;
896 // close must have been canceled (by an import?), or any number of other things..
897 } else if (open) {
898 ceph_assert(session->is_opening());
899 mds->sessionmap.set_state(session, Session::STATE_OPEN);
900 mds->sessionmap.touch_session(session);
901 metrics_handler->add_session(session);
902 ceph_assert(session->get_connection());
903 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
904 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
905 reply->supported_features = supported_features;
906 reply->metric_spec = supported_metric_spec;
907 }
908 mds->send_message_client(reply, session);
909 if (mdcache->is_readonly()) {
910 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
911 mds->send_message_client(m, session);
912 }
913 } else if (session->is_closing() ||
914 session->is_killing()) {
915 // kill any lingering capabilities, leases, requests
916 bool killing = session->is_killing();
917 while (!session->caps.empty()) {
918 Capability *cap = session->caps.front();
919 CInode *in = cap->get_inode();
920 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
921 mds->locker->remove_client_cap(in, cap, killing);
922 }
923 while (!session->leases.empty()) {
924 ClientLease *r = session->leases.front();
925 CDentry *dn = static_cast<CDentry*>(r->parent);
926 dout(20) << " killing client lease of " << *dn << dendl;
927 dn->remove_client_lease(r, mds->locker);
928 }
929 if (client_reconnect_gather.erase(session->info.get_client())) {
930 dout(20) << " removing client from reconnect set" << dendl;
931 if (client_reconnect_gather.empty()) {
932 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
933 reconnect_gather_finish();
934 }
935 }
936 if (client_reclaim_gather.erase(session->info.get_client())) {
937 dout(20) << " removing client from reclaim set" << dendl;
938 if (client_reclaim_gather.empty()) {
939 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
940 mds->maybe_clientreplay_done();
941 }
942 }
943
944 if (session->is_closing()) {
945 // mark con disposable. if there is a fault, we will get a
946 // reset and clean it up. if the client hasn't received the
947 // CLOSE message yet, they will reconnect and get an
948 // ms_handle_remote_reset() and realize they had in fact closed.
949 // do this *before* sending the message to avoid a possible
950 // race.
951 if (session->get_connection()) {
952 // Conditional because terminate_sessions will indiscriminately
953 // put sessions in CLOSING whether they ever had a conn or not.
954 session->get_connection()->mark_disposable();
955 }
956
957 // reset session
958 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
959 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
960 session->clear();
961 metrics_handler->remove_session(session);
962 mds->sessionmap.remove_session(session);
963 } else if (session->is_killing()) {
964 // destroy session, close connection
965 if (session->get_connection()) {
966 session->get_connection()->mark_down();
967 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
968 session->set_connection(nullptr);
969 }
970 metrics_handler->remove_session(session);
971 mds->sessionmap.remove_session(session);
972 } else {
973 ceph_abort();
974 }
975 } else {
976 ceph_abort();
977 }
978 }
979
980 /**
981 * Inject sessions from some source other than actual connections.
982 *
983 * For example:
984 * - sessions inferred from journal replay
985 * - sessions learned from other MDSs during rejoin
986 * - sessions learned from other MDSs during dir/caps migration
987 * - sessions learned from other MDSs during a cross-MDS rename
988 */
989 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
990 map<client_t,client_metadata_t>& cmm,
991 map<client_t, pair<Session*,uint64_t> >& smap)
992 {
993 version_t pv = mds->sessionmap.get_projected();
994
995 dout(10) << "prepare_force_open_sessions " << pv
996 << " on " << cm.size() << " clients"
997 << dendl;
998
999 mds->objecter->with_osdmap(
1000 [this, &cm, &cmm](const OSDMap &osd_map) {
1001 for (auto p = cm.begin(); p != cm.end(); ) {
1002 if (osd_map.is_blocklisted(p->second.addr)) {
1003 dout(10) << " ignoring blocklisted client." << p->first
1004 << " (" << p->second.addr << ")" << dendl;
1005 cmm.erase(p->first);
1006 cm.erase(p++);
1007 } else {
1008 ++p;
1009 }
1010 }
1011 });
1012
1013 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
1014 Session *session = mds->sessionmap.get_or_add_session(p->second);
1015 pv = mds->sessionmap.mark_projected(session);
1016 uint64_t sseq;
1017 if (session->is_closed() ||
1018 session->is_closing() ||
1019 session->is_killing()) {
1020 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
1021 auto q = cmm.find(p->first);
1022 if (q != cmm.end())
1023 session->info.client_metadata.merge(q->second);
1024 } else {
1025 ceph_assert(session->is_open() ||
1026 session->is_opening() ||
1027 session->is_stale());
1028 sseq = 0;
1029 }
1030 smap[p->first] = make_pair(session, sseq);
1031 session->inc_importing();
1032 }
1033 return pv;
1034 }
1035
1036 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
1037 bool dec_import)
1038 {
1039 /*
1040 * FIXME: need to carefully consider the race conditions between a
1041 * client trying to close a session and an MDS doing an import
1042 * trying to force open a session...
1043 */
1044 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
1045 << " initial v " << mds->sessionmap.get_version() << dendl;
1046
1047 for (auto &it : smap) {
1048 Session *session = it.second.first;
1049 uint64_t sseq = it.second.second;
1050 if (sseq > 0) {
1051 if (session->get_state_seq() != sseq) {
1052 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
1053 } else {
1054 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
1055 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1056 mds->sessionmap.touch_session(session);
1057 metrics_handler->add_session(session);
1058
1059 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1060 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1061 reply->supported_features = supported_features;
1062 reply->metric_spec = supported_metric_spec;
1063 }
1064 mds->send_message_client(reply, session);
1065
1066 if (mdcache->is_readonly())
1067 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1068 }
1069 } else {
1070 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
1071 ceph_assert(session->is_open() || session->is_stale());
1072 }
1073
1074 if (dec_import) {
1075 session->dec_importing();
1076 }
1077
1078 mds->sessionmap.mark_dirty(session);
1079 }
1080
1081 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1082 }
1083
1084 class C_MDS_TerminatedSessions : public ServerContext {
1085 void finish(int r) override {
1086 server->terminating_sessions = false;
1087 }
1088 public:
1089 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
1090 };
1091
1092 void Server::terminate_sessions()
1093 {
1094 dout(5) << "terminating all sessions..." << dendl;
1095
1096 terminating_sessions = true;
1097
1098 // kill them off. clients will retry etc.
1099 set<Session*> sessions;
1100 mds->sessionmap.get_client_session_set(sessions);
1101 for (set<Session*>::const_iterator p = sessions.begin();
1102 p != sessions.end();
1103 ++p) {
1104 Session *session = *p;
1105 if (session->is_closing() ||
1106 session->is_killing() ||
1107 session->is_closed())
1108 continue;
1109 journal_close_session(session, Session::STATE_CLOSING, NULL);
1110 }
1111
1112 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
1113 }
1114
1115
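// Scan open sessions for clients that have stopped renewing their caps. Sessions
// idle past session_timeout are marked STALE; sessions already idle past
// session_autoclose, or past a client-specified "timeout", skip the stale state
// and are evicted directly; stale sessions idle past session_autoclose are evicted
// too. Nothing is marked stale if the MDS itself has recently been laggy.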
1116 void Server::find_idle_sessions()
1117 {
1118 auto now = clock::now();
1119 auto last_cleared_laggy = mds->last_cleared_laggy();
1120
1121 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1122
1123 // timeout/stale
1124 // (caps go stale, lease die)
1125 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1126 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1127
1128 // don't kick clients if we've been laggy
1129 if (last_cleared_laggy < cutoff) {
1130 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1131 << "), not marking any client stale" << dendl;
1132 return;
1133 }
1134
1135 std::vector<Session*> to_evict;
1136
1137 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1138 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1139 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1140 std::vector<Session*> new_stale;
1141
1142 for (auto session : *(sessions_p1->second)) {
1143 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1144 if (last_cap_renew_span < cutoff) {
1145 dout(20) << "laggiest active session is " << session->info.inst
1146 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1147 break;
1148 }
1149
1150 if (session->last_seen > session->last_cap_renew) {
1151 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1152 if (last_cap_renew_span < cutoff) {
1153 dout(20) << "laggiest active session is " << session->info.inst
1154 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1155 continue;
1156 }
1157 }
1158
1159 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1160 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1161 "has arrived" << dendl;
1162 // evict session without marking it stale
1163 to_evict.push_back(session);
1164 continue;
1165 }
1166
1167 if (defer_session_stale &&
1168 !session->is_any_flush_waiter() &&
1169 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1170 dout(20) << "deferring marking session " << session->info.inst << " stale "
1171 "since it holds no caps" << dendl;
1172 continue;
1173 }
1174
1175 auto it = session->info.client_metadata.find("timeout");
1176 if (it != session->info.client_metadata.end()) {
1177 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1178 if (timeout == 0) {
1179 dout(10) << "skipping session " << session->info.inst
1180 << ", infinite timeout specified" << dendl;
1181 continue;
1182 }
1183 double cutoff = queue_max_age + timeout;
1184 if (last_cap_renew_span < cutoff) {
1185 dout(10) << "skipping session " << session->info.inst
1186 << ", timeout (" << timeout << ") specified"
1187 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1188 continue;
1189 }
1190
1191 // do not go through stale, evict it directly.
1192 to_evict.push_back(session);
1193 } else {
1194 dout(10) << "new stale session " << session->info.inst
1195 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1196 new_stale.push_back(session);
1197 }
1198 }
1199
1200 for (auto session : new_stale) {
1201 mds->sessionmap.set_state(session, Session::STATE_STALE);
1202 if (mds->locker->revoke_stale_caps(session)) {
1203 mds->locker->remove_stale_leases(session);
1204 finish_flush_session(session, session->get_push_seq());
1205 auto m = make_message<MClientSession>(CEPH_SESSION_STALE);
1206 mds->send_message_client(m, session);
1207 } else {
1208 to_evict.push_back(session);
1209 }
1210 }
1211 }
1212
1213 // autoclose
1214 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1215
1216 // Collect a list of sessions exceeding the autoclose threshold
1217 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1218 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1219 for (auto session : *(sessions_p2->second)) {
1220 ceph_assert(session->is_stale());
1221 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1222 if (last_cap_renew_span < cutoff) {
1223 dout(20) << "oldest stale session is " << session->info.inst
1224 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1225 break;
1226 }
1227 to_evict.push_back(session);
1228 }
1229 }
1230
1231 for (auto session: to_evict) {
1232 if (session->is_importing()) {
1233 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1234 continue;
1235 }
1236
1237 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1238 mds->clog->warn() << "evicting unresponsive client " << *session
1239 << ", after " << last_cap_renew_span << " seconds";
1240 dout(10) << "autoclosing stale session " << session->info.inst
1241 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1242
1243 if (g_conf()->mds_session_blocklist_on_timeout) {
1244 CachedStackStringStream css;
1245 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
1246 } else {
1247 kill_session(session, NULL);
1248 }
1249 }
1250 }
1251
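// Evict clients that have not acknowledged a cap revoke within
// mds_cap_revoke_eviction_timeout seconds. A timeout of 0 disables this check.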
1252 void Server::evict_cap_revoke_non_responders() {
1253 if (!cap_revoke_eviction_timeout) {
1254 return;
1255 }
1256
1257 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1258
1259 for (auto const &client: to_evict) {
1260 mds->clog->warn() << "client id " << client << " has not responded to"
1261 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1262 << " seconds, evicting";
1263 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1264 << client << dendl;
1265
1266 CachedStackStringStream css;
1267 bool evicted = mds->evict_client(client.v, false,
1268 g_conf()->mds_session_blocklist_on_evict,
1269 *css, nullptr);
1270 if (evicted && logger) {
1271 logger->inc(l_mdss_cap_revoke_eviction);
1272 }
1273 }
1274 }
1275
1276 void Server::handle_conf_change(const std::set<std::string>& changed) {
1277 if (changed.count("mds_forward_all_requests_to_auth")){
1278 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
1279 }
1280 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1281 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1282 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1283 << cap_revoke_eviction_timeout << dendl;
1284 }
1285 if (changed.count("mds_recall_max_decay_rate")) {
1286 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1287 }
1288 if (changed.count("mds_max_snaps_per_dir")) {
1289 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1290 dout(20) << __func__ << " max snapshots per directory changed to "
1291 << max_snaps_per_dir << dendl;
1292 }
1293 if (changed.count("mds_client_delegate_inos_pct")) {
1294 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1295 }
1296 if (changed.count("mds_max_caps_per_client")) {
1297 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1298 }
1299 if (changed.count("mds_session_cap_acquisition_throttle")) {
1300 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
1301 }
1302 if (changed.count("mds_session_max_caps_throttle_ratio")) {
1303 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
1304 }
1305 if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1306 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1307 }
1308 if (changed.count("mds_alternate_name_max")) {
1309 alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
1310 }
1311 if (changed.count("mds_fscrypt_last_block_max_size")) {
1312 fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size");
1313 }
1314 if (changed.count("mds_dir_max_entries")) {
1315 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
1316 dout(20) << __func__ << " max entries per directory changed to "
1317 << dir_max_entries << dendl;
1318 }
1319 if (changed.count("mds_bal_fragment_size_max")) {
1320 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
1321 dout(20) << __func__ << " max fragment size changed to "
1322 << bal_fragment_size_max << dendl;
1323 }
1324 if (changed.count("mds_inject_rename_corrupt_dentry_first")) {
1325 inject_rename_corrupt_dentry_first = g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first");
1326 }
1327 }
1328
1329 /*
1330 * XXX bump in the interface here, not using an MDSContext here
1331 * because all the callers right now happen to use a SaferCond
1332 */
1333 void Server::kill_session(Session *session, Context *on_safe)
1334 {
1335 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1336
1337 if ((session->is_opening() ||
1338 session->is_open() ||
1339 session->is_stale()) &&
1340 !session->is_importing()) {
1341 dout(10) << "kill_session " << session << dendl;
1342 journal_close_session(session, Session::STATE_KILLING, on_safe);
1343 } else {
1344 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1345 if (session->is_closing() ||
1346 session->is_killing()) {
1347 if (on_safe)
1348 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1349 } else {
1350 ceph_assert(session->is_closed() ||
1351 session->is_importing());
1352 if (on_safe)
1353 on_safe->complete(0);
1354 }
1355 }
1356 }
1357
1358 size_t Server::apply_blocklist()
1359 {
1360 std::vector<Session*> victims;
1361 const auto& sessions = mds->sessionmap.get_sessions();
1362 mds->objecter->with_osdmap(
1363 [&](const OSDMap& o) {
1364 for (const auto& p : sessions) {
1365 if (!p.first.is_client()) {
1366 // Do not apply OSDMap blocklist to MDS daemons, we find out
1367 // about their death via MDSMap.
1368 continue;
1369 }
1370 if (o.is_blocklisted(p.second->info.inst.addr)) {
1371 victims.push_back(p.second);
1372 }
1373 }
1374 });
1375
1376 for (const auto& s : victims) {
1377 kill_session(s, nullptr);
1378 }
1379
1380 dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
1381
1382 return victims.size();
1383 }
1384
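// Move the session to the given state (CLOSING or KILLING), journal an ESession
// close that releases its free and pending preallocated inos, kill its in-flight
// requests, and apply the state change in _session_logged() once the entry is safe.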
1385 void Server::journal_close_session(Session *session, int state, Context *on_safe)
1386 {
1387 dout(10) << __func__ << " : "
1388 << session->info.inst
1389 << " pending_prealloc_inos " << session->pending_prealloc_inos
1390 << " free_prealloc_inos " << session->free_prealloc_inos
1391 << " delegated_inos " << session->delegated_inos << dendl;
1392
1393 uint64_t sseq = mds->sessionmap.set_state(session, state);
1394 version_t pv = mds->sessionmap.mark_projected(session);
1395 version_t piv = 0;
1396
1397 // release alloc and pending-alloc inos for this session
1398 // and wipe out session state, in case the session close aborts for some reason
1399 interval_set<inodeno_t> inos_to_free;
1400 inos_to_free.insert(session->pending_prealloc_inos);
1401 inos_to_free.insert(session->free_prealloc_inos);
1402 if (inos_to_free.size()) {
1403 mds->inotable->project_release_ids(inos_to_free);
1404 piv = mds->inotable->get_projected_version();
1405 } else
1406 piv = 0;
1407
1408 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1409 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1410 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1411 mdlog->start_submit_entry(le, fin);
1412 mdlog->flush();
1413
1414 // clean up requests, too
1415 while(!session->requests.empty()) {
1416 auto mdr = MDRequestRef(*session->requests.begin());
1417 mdcache->request_kill(mdr);
1418 }
1419
1420 finish_flush_session(session, session->get_push_seq());
1421 }
1422
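// Start of the reconnect phase on this rank: every open client session is added to
// client_reconnect_gather and is expected to send an MClientReconnect;
// reconnect_gather_finish() runs once the gather set empties (immediately if there
// are no open sessions to wait for).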
1423 void Server::reconnect_clients(MDSContext *reconnect_done_)
1424 {
1425 reconnect_done = reconnect_done_;
1426
1427 auto now = clock::now();
1428 set<Session*> sessions;
1429 mds->sessionmap.get_client_session_set(sessions);
1430 for (auto session : sessions) {
1431 if (session->is_open()) {
1432 client_reconnect_gather.insert(session->get_client());
1433 session->set_reconnecting(true);
1434 session->last_cap_renew = now;
1435 }
1436 }
1437
1438 if (client_reconnect_gather.empty()) {
1439 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1440 reconnect_gather_finish();
1441 return;
1442 }
1443
1444 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1445
1446 reconnect_start = now;
1447 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1448 mds->sessionmap.dump();
1449 }
1450
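// Process a client's reconnect message. The attempt is rejected or closed if the
// session is gone, the fs refuses client sessions, the rank is no longer in
// reconnect, or required features are missing; otherwise the reported snaprealm
// and cap state is fed back into the cache and the client receives an OPEN reply.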
1451 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1452 {
1453 dout(7) << "handle_client_reconnect " << m->get_source()
1454 << (m->has_more() ? " (more)" : "") << dendl;
1455 client_t from = m->get_source().num();
1456 Session *session = mds->get_session(m);
1457 if (!session) {
1458 dout(0) << " ignoring sessionless msg " << *m << dendl;
1459 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1460 reply->metadata["error_string"] = "sessionless";
1461 mds->send_message(reply, m->get_connection());
1462 return;
1463 }
1464
1465 if(mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
1466 mds->clog->warn() << "client could not reconnect as"
1467 " file system flag refuse_client_session is set";
1468 dout(0) << "client cannot reconnect when file system flag"
1469 " refuse_client_session is set" << dendl;
1470 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1471 reply->metadata["error_string"] = "client cannot reconnect when file system flag"
1472 " refuse_client_session is set";
1473 mds->send_message(reply, m->get_connection());
1474 return;
1475 }
1476
1477 if (!session->is_open()) {
1478 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1479 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1480 mds->send_message(reply, m->get_connection());
1481 return;
1482 }
1483
1484 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1485
1486 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1487 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1488 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1489 return;
1490 }
1491
1492 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1493 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1494
1495 bool deny = false;
1496 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1497 // XXX maybe in the future we can do better than this?
1498 if (reconnect_all_deny) {
1499 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1500 } else {
1501 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1502 }
1503 mds->clog->info() << "denied reconnect attempt (mds is "
1504 << ceph_mds_state_name(mds->get_state())
1505 << ") from " << m->get_source_inst()
1506 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1507 deny = true;
1508 } else {
1509 std::string error_str;
1510 if (!session->is_open()) {
1511 error_str = "session is closed";
1512 } else if (mdcache->is_readonly()) {
1513 error_str = "mds is readonly";
1514 } else {
1515 if (session->info.client_metadata.features.empty())
1516 infer_supported_features(session, session->info.client_metadata);
1517
1518 feature_bitset_t missing_features = required_client_features;
1519 missing_features -= session->info.client_metadata.features;
1520 if (!missing_features.empty()) {
1521 CachedStackStringStream css;
1522 *css << "missing required features '" << missing_features << "'";
1523 error_str = css->strv();
1524 }
1525 }
1526
1527 if (!error_str.empty()) {
1528 deny = true;
1529 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1530 mds->clog->info() << "denied reconnect attempt from "
1531 << m->get_source_inst() << " (" << error_str << ")";
1532 }
1533 }
1534
1535 if (deny) {
1536 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1537 mds->send_message_client(r, session);
1538 if (session->is_open()) {
1539 client_reconnect_denied.insert(session->get_client());
1540 }
1541 return;
1542 }
1543
1544 if (!m->has_more()) {
1545 metrics_handler->add_session(session);
1546 // notify client of success with an OPEN
1547 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1548 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1549 reply->supported_features = supported_features;
1550 reply->metric_spec = supported_metric_spec;
1551 }
1552 mds->send_message_client(reply, session);
1553 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1554 }
1555
1556 session->last_cap_renew = clock::now();
1557
1558 // snaprealms
1559 for (const auto &r : m->realms) {
1560 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1561 if (in && in->state_test(CInode::STATE_PURGING))
1562 continue;
1563 if (in) {
1564 if (in->snaprealm) {
1565 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1566 } else {
1567 // this can happen if we are non-auth or we rolled back the snaprealm
1568 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1569 }
1570 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1571 } else {
1572 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1573 << " seq " << r.realm.seq << dendl;
1574 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1575 }
1576 }
1577
1578 // caps
1579 for (const auto &p : m->caps) {
1580 // make sure our last_cap_id is MAX over all issued caps
1581 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1582 mdcache->last_cap_id = p.second.capinfo.cap_id;
1583
1584 CInode *in = mdcache->get_inode(p.first);
1585 if (in && in->state_test(CInode::STATE_PURGING))
1586 continue;
1587 if (in && in->is_auth()) {
1588 // we recovered it, and it's ours. take note.
1589 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1590 << " on " << *in << dendl;
1591 in->reconnect_cap(from, p.second, session);
1592 mdcache->add_reconnected_cap(from, p.first, p.second);
1593 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1594 continue;
1595 }
1596
1597 if (in && !in->is_auth()) {
1598 // not mine.
1599 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1600 // add to cap export list.
1601 mdcache->rejoin_export_caps(p.first, from, p.second,
1602 in->authority().first, true);
1603 } else {
1604 // don't know if the inode is mine
1605 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1606 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1607 }
1608 }
1609
1610 reconnect_last_seen = clock::now();
1611
1612 if (!m->has_more()) {
1613 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1614
1615 // remove from gather set
1616 client_reconnect_gather.erase(from);
1617 session->set_reconnecting(false);
1618 if (client_reconnect_gather.empty())
1619 reconnect_gather_finish();
1620 }
1621 }
1622
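// For old clients that did not report a feature bitmap, guess the newest
// feature level from session metadata ("ceph_version" for userspace clients,
// "kernel_version" plus connection features for kernel clients) and synthesize
// a bitmap with every bit up to that level set: (1UL << (supported + 1)) - 1.
// For instance, supported == 3 would yield 0b1111, i.e. features 0 through 3.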
1623 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1624 {
1625 int supported = -1;
1626 auto it = client_metadata.find("ceph_version");
1627 if (it != client_metadata.end()) {
1628 // user space client
1629 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1630 supported = CEPHFS_FEATURE_LUMINOUS;
1631 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1632 supported = CEPHFS_FEATURE_KRAKEN;
1633 } else {
1634 it = client_metadata.find("kernel_version");
1635 if (it != client_metadata.end()) {
1636 // kernel client
1637 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1638 supported = CEPHFS_FEATURE_LUMINOUS;
1639 }
1640 }
1641 if (supported == -1 &&
1642 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1643 supported = CEPHFS_FEATURE_JEWEL;
1644
1645 if (supported >= 0) {
1646 unsigned long value = (1UL << (supported + 1)) - 1;
1647 client_metadata.features = feature_bitset_t(value);
1648 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1649 }
1650 }
1651
1652 void Server::update_required_client_features()
1653 {
1654 required_client_features = mds->mdsmap->get_required_client_features();
1655 dout(7) << "required_client_features: " << required_client_features << dendl;
1656
1657 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1658 set<Session*> sessions;
1659 mds->sessionmap.get_client_session_set(sessions);
1660 for (auto session : sessions) {
1661 feature_bitset_t missing_features = required_client_features;
1662 missing_features -= session->info.client_metadata.features;
1663 if (!missing_features.empty()) {
1664 bool blocklisted = mds->objecter->with_osdmap(
1665 [session](const OSDMap &osd_map) -> bool {
1666 return osd_map.is_blocklisted(session->info.inst.addr);
1667 });
1668 if (blocklisted)
1669 continue;
1670
1671 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1672 << missing_features << "'";
1673 CachedStackStringStream css;
1674 mds->evict_client(session->get_client().v, false,
1675 g_conf()->mds_session_blocklist_on_evict, *css);
1676 }
1677 }
1678 }
1679 }
1680
1681 void Server::reconnect_gather_finish()
1682 {
1683 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1684 ceph_assert(reconnect_done);
1685
1686 if (!mds->snapclient->is_synced()) {
1687 // make sure snaptable cache is populated. snaprealms will be
1688 // extensively used in rejoin stage.
1689 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1690 mds->snapclient->wait_for_sync(reconnect_done);
1691 } else {
1692 reconnect_done->complete(0);
1693 }
1694 reconnect_done = NULL;
1695 }
1696
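// Periodic check during the reconnect phase. Once mds_reconnect_timeout has
// elapsed since reconnect_start (and at least half of it since the last client
// activity), or when every remaining client has already been denied under
// mds_deny_all_reconnect, the stragglers are evicted (optionally blocklisted)
// or killed; sessions advertising a "timeout" metadata key are kept and moved
// to the reclaim gather instead.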
1697 void Server::reconnect_tick()
1698 {
1699 bool reject_all_reconnect = false;
1700 if (reconnect_evicting) {
1701 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1702 return;
1703 }
1704
1705 /*
1706 * Set mds_deny_all_reconnect to reject all reconnect requests, so that less
1707 * metadata has to be loaded in the rejoin phase. This shortens the reboot time.
1708 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
1709
1710 * Why not shorten the reconnect period instead?
1711 * Clients may send unsafe or retried requests, which had not been
1712 * completed before the old mds stopped, to the new mds. These requests may
1713 * need to be processed during the new mds's clientreplay phase,
1714 * see: https://github.com/ceph/ceph/pull/29059.
1715 */
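  // For example (assuming the standard `ceph config` CLI), an operator who
  // wants a faster failover at the cost of forcing clients to re-establish
  // their sessions could set:
  //   ceph config set mds mds_deny_all_reconnect true
  // and unset it again once the replacement rank is active.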
1716 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1717 if (client_reconnect_gather.empty())
1718 return;
1719
1720 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1721 reject_all_reconnect = true;
1722
1723 auto now = clock::now();
1724 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1725 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
1726 return;
1727
1728 vector<Session*> remaining_sessions;
1729 remaining_sessions.reserve(client_reconnect_gather.size());
1730 for (auto c : client_reconnect_gather) {
1731 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1732 ceph_assert(session);
1733 remaining_sessions.push_back(session);
1734 // client re-sends cap flush messages before the reconnect message
1735 if (session->last_seen > reconnect_last_seen)
1736 reconnect_last_seen = session->last_seen;
1737 }
1738
1739 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1740 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1741 dout(7) << "reconnect_tick: last seen " << elapse2
1742 << " seconds ago, extending reconnect interval" << dendl;
1743 return;
1744 }
1745
1746 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1747 << " clients have not reconnected in time" << dendl;
1748
1749 // If we're doing blocklist evictions, use this to wait for them before
1750 // proceeding to reconnect_gather_finish
1751 MDSGatherBuilder gather(g_ceph_context);
1752
1753 for (auto session : remaining_sessions) {
1754 // Keep sessions that have specified a timeout. These sessions will prevent
1755 // the mds from going active. The MDS goes active only after they all have been
1756 // killed or reclaimed.
1757 if (session->info.client_metadata.find("timeout") !=
1758 session->info.client_metadata.end()) {
1759 dout(1) << "reconnect keeps " << session->info.inst
1760 << ", need to be reclaimed" << dendl;
1761 client_reclaim_gather.insert(session->get_client());
1762 continue;
1763 }
1764
1765 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1766
1767 mds->clog->warn() << "evicting unresponsive client " << *session
1768 << ", after waiting " << elapse1
1769 << " seconds during MDS startup";
1770
1771 // make _session_logged() purge orphan objects of lost async/unsafe requests
1772 session->delegated_inos.swap(session->free_prealloc_inos);
1773
1774 if (g_conf()->mds_session_blocklist_on_timeout) {
1775 CachedStackStringStream css;
1776 mds->evict_client(session->get_client().v, false, true, *css,
1777 gather.new_sub());
1778 } else {
1779 kill_session(session, NULL);
1780 }
1781
1782 failed_reconnects++;
1783 }
1784 client_reconnect_gather.clear();
1785 client_reconnect_denied.clear();
1786
1787 if (gather.has_subs()) {
1788 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1789 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1790 [this](int r){reconnect_gather_finish();})));
1791 gather.activate();
1792 reconnect_evicting = true;
1793 } else {
1794 reconnect_gather_finish();
1795 }
1796 }
1797
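// Re-populate an inode's lock state from the flockbl blob a client sent with
// its reconnect. The blob is two back-to-back sections, each an integer count
// followed by that many ceph_filelock records:
//   [num_fcntl_locks][lock]...[num_flock_locks][lock]...
// fcntl (POSIX) locks come first, flock (BSD) locks second; each lock is keyed
// by its start offset and re-attributed to the reconnecting client.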
1798 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1799 {
1800 if (!locks.length()) return;
1801 int numlocks;
1802 ceph_filelock lock;
1803 auto p = locks.cbegin();
1804 decode(numlocks, p);
1805 for (int i = 0; i < numlocks; ++i) {
1806 decode(lock, p);
1807 lock.client = client;
1808 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1809 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1810 }
1811 decode(numlocks, p);
1812 for (int i = 0; i < numlocks; ++i) {
1813 decode(lock, p);
1814 lock.client = client;
1815 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1816 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1817 }
1818 }
1819
1820 /**
1821 * Call this when the MDCache is oversized, to send requests to the clients
1822 * to trim some caps, and consequently unpin some inodes in the MDCache so
1823 * that it can trim too.
1824 */
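// Sessions are visited largest first by cap count. For each, the new limit is
// effectively newlim = max(min_caps_per_client, num_caps - mds_recall_max_caps),
// delivered via a CEPH_SESSION_RECALL_STATE message. Three decay counters gate
// the work (the per-session recall counter, its second-order counter, and the
// global recall counter); hitting any of them marks the result as throttled.
// Returns {throttled, number of caps recalled}.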
1825 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1826 {
1827 const auto now = clock::now();
1828 const bool steady = !!(flags&RecallFlags::STEADY);
1829 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1830 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1831 const bool trim = !!(flags&RecallFlags::TRIM);
1832
1833 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1834 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1835 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1836 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1837 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1838 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1839
1840 dout(7) << __func__ << ":"
1841 << " min=" << min_caps_per_client
1842 << " max=" << max_caps_per_client
1843 << " total=" << Capability::count()
1844 << " flags=" << flags
1845 << dendl;
1846
1847 /* trim caps of sessions with the most caps first */
1848 std::multimap<uint64_t, Session*> caps_session;
1849 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1850 auto num_caps = s->caps.size();
1851 auto cache_liveness = s->get_session_cache_liveness();
1852 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1853 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1854 }
1855 };
1856 mds->sessionmap.get_client_sessions(std::move(f));
1857
1858 std::pair<bool, uint64_t> result = {false, 0};
1859 auto& [throttled, caps_recalled] = result;
1860 last_recall_state = now;
1861 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1862 if (!session->is_open() ||
1863 !session->get_connection() ||
1864 !session->info.inst.name.is_client())
1865 continue;
1866
1867 dout(10) << __func__ << ":"
1868 << " session " << session->info.inst
1869 << " caps " << num_caps
1870 << ", leases " << session->leases.size()
1871 << dendl;
1872
1873 uint64_t newlim;
1874 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1875 newlim = min_caps_per_client;
1876 } else {
1877 newlim = num_caps-recall_max_caps;
1878 }
1879 if (num_caps > newlim) {
1880 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1881 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1882 newlim = num_caps-recall;
1883 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1884 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1885 const uint64_t global_recall_throttle = recall_throttle.get();
1886 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1887 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1888 throttled = true;
1889 continue;
1890 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1891 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1892 throttled = true;
1893 continue;
1894 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1895 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1896 throttled = true;
1897 break;
1898 }
1899
1900 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1901 if (steady) {
1902 const auto session_recall = session->get_recall_caps();
1903 const auto session_release = session->get_release_caps();
1904 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1905 /* The session has been unable to keep up with the number of caps
1906 * recalled (by half); additionally, to prevent marking sessions
1907 * we've just begun to recall from, the session_recall counter
1908 * (decayed count of caps recently recalled) is **greater** than the
1909 * session threshold for the session's cap recall throttle.
1910 */
1911 dout(15) << " 2*session_release < session_recall"
1912 " (2*" << session_release << " < " << session_recall << ") &&"
1913 " 2*session_recall < recall_max_decay_threshold"
1914 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1915 " Skipping because we are unlikely to get more released." << dendl;
1916 continue;
1917 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1918 /* The number of caps to recall is less than the number we *could*
1919 * recall (so there isn't much left to recall) and it is also less
1920 * than half the current recall_caps counter (the decayed count of
1921 * caps recently recalled).
1922 */
1923 dout(15) << " 2*recall < session_recall "
1924 " (2*" << recall << " < " << session_recall << ") &&"
1925 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1926 " Skipping because we are unlikely to get more released." << dendl;
1927 continue;
1928 }
1929 }
1930
1931 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1932
1933 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1934 m->head.max_caps = newlim;
1935 mds->send_message_client(m, session);
1936 if (gather) {
1937 flush_session(session, *gather);
1938 }
1939 caps_recalled += session->notify_recall_sent(newlim);
1940 recall_throttle.hit(recall);
1941 }
1942 }
1943
1944 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1945
1946 return result;
1947 }
1948
1949 void Server::force_clients_readonly()
1950 {
1951 dout(10) << "force_clients_readonly" << dendl;
1952 set<Session*> sessions;
1953 mds->sessionmap.get_client_session_set(sessions);
1954 for (set<Session*>::const_iterator p = sessions.begin();
1955 p != sessions.end();
1956 ++p) {
1957 Session *session = *p;
1958 if (!session->info.inst.name.is_client() ||
1959 !(session->is_open() || session->is_stale()))
1960 continue;
1961 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1962 }
1963 }
1964
1965 /*******
1966 * some generic stuff for finishing off requests
1967 */
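// Standard tail for an update: pin the trace inode/dentry on the mdr, try to
// send an early (unsafe) reply, mark the request committing, and submit the
// journal event. During replay the next queued replay op is kicked; otherwise
// rdlocks are dropped if an early reply went out, or the log is flushed so the
// client gets its safe reply promptly.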
1968 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1969 {
1970 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1971 ceph_assert(!mdr->has_completed);
1972
1973 // note trace items for eventual reply.
1974 mdr->tracei = in;
1975 if (in)
1976 mdr->pin(in);
1977
1978 mdr->tracedn = dn;
1979 if (dn)
1980 mdr->pin(dn);
1981
1982 early_reply(mdr, in, dn);
1983
1984 mdr->committing = true;
1985 submit_mdlog_entry(le, fin, mdr, __func__);
1986
1987 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1988 if (mds->queue_one_replay()) {
1989 dout(10) << " queued next replay op" << dendl;
1990 } else {
1991 dout(10) << " journaled last replay op" << dendl;
1992 }
1993 } else if (mdr->did_early_reply)
1994 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1995 else
1996 mdlog->flush();
1997 }
1998
1999 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
2000 std::string_view event)
2001 {
2002 if (mdr) {
2003 string event_str("submit entry: ");
2004 event_str += event;
2005 mdr->mark_event(event_str);
2006 }
2007 mdlog->submit_entry(le, fin);
2008 }
2009
2010 /*
2011 * send response built from mdr contents and error code; clean up mdr
2012 */
2013 void Server::respond_to_request(MDRequestRef& mdr, int r)
2014 {
2015 if (mdr->client_request) {
2016 if (mdr->is_batch_head()) {
2017 dout(20) << __func__ << " batch head " << *mdr << dendl;
2018 mdr->release_batch_op()->respond(r);
2019 } else {
2020 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
2021 }
2022 } else if (mdr->internal_op > -1) {
2023 dout(10) << "respond_to_request on internal request " << mdr << dendl;
2024 if (!mdr->internal_op_finish)
2025 ceph_abort_msg("trying to respond to internal op without finisher");
2026 mdr->internal_op_finish->complete(r);
2027 mdcache->request_finish(mdr);
2028 }
2029 }
2030
2031 // record per-op latency statistics for MDS client requests
2032 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
2033 {
2034 int code = l_mdss_first;
2035 switch(req->get_op()) {
2036 case CEPH_MDS_OP_LOOKUPHASH:
2037 code = l_mdss_req_lookuphash_latency;
2038 break;
2039 case CEPH_MDS_OP_LOOKUPINO:
2040 code = l_mdss_req_lookupino_latency;
2041 break;
2042 case CEPH_MDS_OP_LOOKUPPARENT:
2043 code = l_mdss_req_lookupparent_latency;
2044 break;
2045 case CEPH_MDS_OP_LOOKUPNAME:
2046 code = l_mdss_req_lookupname_latency;
2047 break;
2048 case CEPH_MDS_OP_LOOKUP:
2049 code = l_mdss_req_lookup_latency;
2050 break;
2051 case CEPH_MDS_OP_LOOKUPSNAP:
2052 code = l_mdss_req_lookupsnap_latency;
2053 break;
2054 case CEPH_MDS_OP_GETATTR:
2055 code = l_mdss_req_getattr_latency;
2056 break;
2057 case CEPH_MDS_OP_SETATTR:
2058 code = l_mdss_req_setattr_latency;
2059 break;
2060 case CEPH_MDS_OP_SETLAYOUT:
2061 code = l_mdss_req_setlayout_latency;
2062 break;
2063 case CEPH_MDS_OP_SETDIRLAYOUT:
2064 code = l_mdss_req_setdirlayout_latency;
2065 break;
2066 case CEPH_MDS_OP_GETVXATTR:
2067 code = l_mdss_req_getvxattr_latency;
2068 break;
2069 case CEPH_MDS_OP_SETXATTR:
2070 code = l_mdss_req_setxattr_latency;
2071 break;
2072 case CEPH_MDS_OP_RMXATTR:
2073 code = l_mdss_req_rmxattr_latency;
2074 break;
2075 case CEPH_MDS_OP_READDIR:
2076 code = l_mdss_req_readdir_latency;
2077 break;
2078 case CEPH_MDS_OP_SETFILELOCK:
2079 code = l_mdss_req_setfilelock_latency;
2080 break;
2081 case CEPH_MDS_OP_GETFILELOCK:
2082 code = l_mdss_req_getfilelock_latency;
2083 break;
2084 case CEPH_MDS_OP_CREATE:
2085 code = l_mdss_req_create_latency;
2086 break;
2087 case CEPH_MDS_OP_OPEN:
2088 code = l_mdss_req_open_latency;
2089 break;
2090 case CEPH_MDS_OP_MKNOD:
2091 code = l_mdss_req_mknod_latency;
2092 break;
2093 case CEPH_MDS_OP_LINK:
2094 code = l_mdss_req_link_latency;
2095 break;
2096 case CEPH_MDS_OP_UNLINK:
2097 code = l_mdss_req_unlink_latency;
2098 break;
2099 case CEPH_MDS_OP_RMDIR:
2100 code = l_mdss_req_rmdir_latency;
2101 break;
2102 case CEPH_MDS_OP_RENAME:
2103 code = l_mdss_req_rename_latency;
2104 break;
2105 case CEPH_MDS_OP_MKDIR:
2106 code = l_mdss_req_mkdir_latency;
2107 break;
2108 case CEPH_MDS_OP_SYMLINK:
2109 code = l_mdss_req_symlink_latency;
2110 break;
2111 case CEPH_MDS_OP_LSSNAP:
2112 code = l_mdss_req_lssnap_latency;
2113 break;
2114 case CEPH_MDS_OP_MKSNAP:
2115 code = l_mdss_req_mksnap_latency;
2116 break;
2117 case CEPH_MDS_OP_RMSNAP:
2118 code = l_mdss_req_rmsnap_latency;
2119 break;
2120 case CEPH_MDS_OP_RENAMESNAP:
2121 code = l_mdss_req_renamesnap_latency;
2122 break;
2123 case CEPH_MDS_OP_READDIR_SNAPDIFF:
2124 code = l_mdss_req_snapdiff_latency;
2125 break;
2126 default:
2127 dout(1) << ": unknown client op" << dendl;
2128 return;
2129 }
2130 logger->tinc(code, lat);
2131 }
2132
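// Send an unsafe reply before the journal entry commits, so the client can
// proceed without waiting for the log flush. Skipped when mds_early_reply is
// off, the request forbids it (no_early_reply), peers have already journaled,
// an inode was allocated, the requester is another MDS, or this is a replay.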
2133 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2134 {
2135 if (!g_conf()->mds_early_reply)
2136 return;
2137
2138 if (mdr->no_early_reply) {
2139 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2140 return;
2141 }
2142
2143 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2144 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2145 return;
2146 }
2147
2148 if (mdr->alloc_ino) {
2149 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2150 return;
2151 }
2152
2153 const cref_t<MClientRequest> &req = mdr->client_request;
2154 entity_inst_t client_inst = req->get_source_inst();
2155 if (client_inst.name.is_mds())
2156 return;
2157
2158 if (req->is_replay()) {
2159 dout(10) << " no early reply on replay op" << dendl;
2160 return;
2161 }
2162
2163
2164 auto reply = make_message<MClientReply>(*req, 0);
2165 reply->set_unsafe();
2166
2167 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2168 //
2169 // _rename_finish() does not send dentry link/unlink messages to replicas,
2170 // so do not set xlocks on dentries "done"; the xlocks prevent dentries
2171 // that have projected linkages from getting new replicas.
2172 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2173
2174 dout(10) << "early_reply " << reply->get_result()
2175 << " (" << cpp_strerror(reply->get_result())
2176 << ") " << *req << dendl;
2177
2178 if (tracei || tracedn) {
2179 if (tracei)
2180 mdr->cap_releases.erase(tracei->vino());
2181 if (tracedn)
2182 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2183
2184 set_trace_dist(reply, tracei, tracedn, mdr);
2185 }
2186
2187 reply->set_extra_bl(mdr->reply_extra_bl);
2188 mds->send_message_client(reply, mdr->session);
2189
2190 mdr->did_early_reply = true;
2191
2192 mds->logger->inc(l_mds_reply);
2193 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2194 mds->logger->tinc(l_mds_reply_latency, lat);
2195 if (lat >= g_conf()->mds_op_complaint_time) {
2196 mds->logger->inc(l_mds_slow_reply);
2197 }
2198 if (client_inst.name.is_client()) {
2199 mds->sessionmap.hit_session(mdr->session);
2200 }
2201 perf_gather_op_latency(req, lat);
2202 dout(20) << "lat " << lat << dendl;
2203
2204 mdr->mark_event("early_replied");
2205 }
2206
2207 /*
2208 * send given reply
2209 * include a trace to tracei
2210 * Clean up mdr
2211 */
2212 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2213 {
2214 ceph_assert(mdr.get());
2215 const cref_t<MClientRequest> &req = mdr->client_request;
2216
2217 dout(7) << "reply_client_request " << reply->get_result()
2218 << " (" << cpp_strerror(reply->get_result())
2219 << ") " << *req << dendl;
2220
2221 mdr->mark_event("replying");
2222
2223 Session *session = mdr->session;
2224
2225 // note successful request in session map?
2226 //
2227 // setfilelock requests are special: they only modify state in MDS memory.
2228 // That state is lost when the MDS fails. If a client re-sends a completed
2229 // setfilelock request, it means the client did not receive the corresponding
2230 // setfilelock reply, so the MDS should re-execute the setfilelock request.
2231 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2232 reply->get_result() == 0 && session) {
2233 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2234 session->add_completed_request(mdr->reqid.tid, created);
2235 if (mdr->ls) {
2236 mdr->ls->touched_sessions.insert(session->info.inst.name);
2237 }
2238 }
2239
2240 // give any preallocated inos to the session
2241 apply_allocated_inos(mdr, session);
2242
2243 // get tracei/tracedn from mdr?
2244 CInode *tracei = mdr->tracei;
2245 CDentry *tracedn = mdr->tracedn;
2246
2247 bool is_replay = mdr->client_request->is_replay();
2248 bool did_early_reply = mdr->did_early_reply;
2249 entity_inst_t client_inst = req->get_source_inst();
2250
2251 if (!did_early_reply && !is_replay) {
2252
2253 mds->logger->inc(l_mds_reply);
2254 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2255 mds->logger->tinc(l_mds_reply_latency, lat);
2256 if (lat >= g_conf()->mds_op_complaint_time) {
2257 mds->logger->inc(l_mds_slow_reply);
2258 }
2259 if (session && client_inst.name.is_client()) {
2260 mds->sessionmap.hit_session(session);
2261 }
2262 perf_gather_op_latency(req, lat);
2263 dout(20) << "lat " << lat << dendl;
2264
2265 if (tracei)
2266 mdr->cap_releases.erase(tracei->vino());
2267 if (tracedn)
2268 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2269 }
2270
2271 // drop non-rdlocks before replying, so that we can issue leases
2272 mdcache->request_drop_non_rdlocks(mdr);
2273
2274 // reply at all?
2275 if (session && !client_inst.name.is_mds()) {
2276 // send reply.
2277 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2278 (tracei || tracedn)) {
2279 if (is_replay) {
2280 if (tracei)
2281 mdcache->try_reconnect_cap(tracei, session);
2282 } else {
2283 // include metadata in reply
2284 set_trace_dist(reply, tracei, tracedn, mdr);
2285 }
2286 }
2287
2288 // We can set the extra bl unconditionally: if it's already been sent in the
2289 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2290 reply->set_extra_bl(mdr->reply_extra_bl);
2291
2292 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2293 mds->send_message_client(reply, session);
2294 }
2295
2296 if (req->is_queued_for_replay() &&
2297 (mdr->has_completed || reply->get_result() < 0)) {
2298 if (reply->get_result() < 0) {
2299 int r = reply->get_result();
2300 derr << "reply_client_request: failed to replay " << *req
2301 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2302 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2303 }
2304 mds->queue_one_replay();
2305 }
2306
2307 // clean up request
2308 mdcache->request_finish(mdr);
2309
2310 // take a closer look at tracei, if it happens to be a remote link
2311 if (tracei &&
2312 tracedn &&
2313 tracedn->get_projected_linkage()->is_remote()) {
2314 mdcache->eval_remote(tracedn);
2315 }
2316 }
2317
2318 /*
2319 * pass inode OR dentry (not both, or we may get confused)
2320 *
2321 * trace is in reverse order (i.e. root inode comes last)
2322 */
2323 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2324 CInode *in, CDentry *dn,
2325 MDRequestRef& mdr)
2326 {
2327 // skip doing this for debugging purposes?
2328 if (g_conf()->mds_inject_traceless_reply_probability &&
2329 mdr->ls && !mdr->o_trunc &&
2330 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2331 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2332 return;
2333 }
2334
2335 // inode, dentry, dir, ..., inode
2336 bufferlist bl;
2337 mds_rank_t whoami = mds->get_nodeid();
2338 Session *session = mdr->session;
2339 snapid_t snapid = mdr->snapid;
2340 utime_t now = ceph_clock_now();
2341
2342 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2343
2344 // realm
2345 if (snapid == CEPH_NOSNAP) {
2346 SnapRealm *realm;
2347 if (in)
2348 realm = in->find_snaprealm();
2349 else
2350 realm = dn->get_dir()->get_inode()->find_snaprealm();
2351 reply->snapbl = get_snap_trace(session, realm);
2352 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2353 }
2354
2355 // dir + dentry?
2356 if (dn) {
2357 reply->head.is_dentry = 1;
2358 CDir *dir = dn->get_dir();
2359 CInode *diri = dir->get_inode();
2360
2361 diri->encode_inodestat(bl, session, NULL, snapid);
2362 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2363
2364 #ifdef MDS_VERIFY_FRAGSTAT
2365 if (dir->is_complete())
2366 dir->verify_fragstat();
2367 #endif
2368 DirStat ds;
2369 ds.frag = dir->get_frag();
2370 ds.auth = dir->get_dir_auth().first;
2371 if (dir->is_auth() && !forward_all_requests_to_auth)
2372 dir->get_dist_spec(ds.dist, whoami);
2373
2374 dir->encode_dirstat(bl, session->info, ds);
2375 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2376
2377 encode(dn->get_name(), bl);
2378 mds->locker->issue_client_lease(dn, in, mdr, now, bl);
2379 } else
2380 reply->head.is_dentry = 0;
2381
2382 // inode
2383 if (in) {
2384 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2385 dout(20) << "set_trace_dist added snap " << snapid << " in " << *in
2386 << dendl;
2387 reply->head.is_target = 1;
2388 } else
2389 reply->head.is_target = 0;
2390
2391 reply->set_trace(bl);
2392 }
2393
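// Entry point for a client request. After validating the session, requests that
// were already completed (replays/retries) get the cached reply or are converted
// into a lookup/getattr; the session's completed-request list is trimmed against
// the client's oldest_client_tid; then an MDRequest is registered, embedded cap
// releases are processed, and the request is dispatched.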
2394 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2395 {
2396 dout(4) << "handle_client_request " << *req << dendl;
2397
2398 if (mds->logger)
2399 mds->logger->inc(l_mds_request);
2400 if (logger)
2401 logger->inc(l_mdss_handle_client_request);
2402
2403 if (!mdcache->is_open()) {
2404 dout(5) << "waiting for root" << dendl;
2405 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2406 return;
2407 }
2408
2409 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2410 // active session?
2411 Session *session = 0;
2412 if (req->is_a_client()) {
2413 session = mds->get_session(req);
2414 if (!session) {
2415 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2416 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2417 session->is_closing() ||
2418 session->is_killing()) {
2419 dout(5) << "session closed|closing|killing, dropping" << dendl;
2420 session = NULL;
2421 }
2422 if (!session) {
2423 if (req->is_queued_for_replay())
2424 mds->queue_one_replay();
2425 return;
2426 }
2427 }
2428
2429 // old mdsmap?
2430 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2431 // send it? hrm, this isn't ideal; they may get a lot of copies if
2432 // they have a high request rate.
2433 }
2434
2435 // completed request?
2436 bool has_completed = false;
2437 if (req->is_replay() || req->get_retry_attempt()) {
2438 ceph_assert(session);
2439 inodeno_t created;
2440 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2441 has_completed = true;
2442 if (!session->is_open())
2443 return;
2444 // Don't send a traceless reply if the completed request created a
2445 // new inode. Treat the request as a lookup request instead.
2446 if (req->is_replay() ||
2447 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2448 req->get_op() != CEPH_MDS_OP_OPEN &&
2449 req->get_op() != CEPH_MDS_OP_CREATE)) {
2450 dout(5) << "already completed " << req->get_reqid() << dendl;
2451 auto reply = make_message<MClientReply>(*req, 0);
2452 if (created != inodeno_t()) {
2453 bufferlist extra;
2454 encode(created, extra);
2455 reply->set_extra_bl(extra);
2456 }
2457 mds->send_message_client(reply, session);
2458
2459 if (req->is_queued_for_replay())
2460 mds->queue_one_replay();
2461
2462 return;
2463 }
2464 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2465 req->get_op() != CEPH_MDS_OP_CREATE) {
2466 dout(10) << " completed request which created new inode " << created
2467 << ", convert it to lookup request" << dendl;
2468 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2469 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2470 }
2471 }
2472 }
2473
2474 // trim completed_request list
2475 if (req->get_oldest_client_tid() > 0) {
2476 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2477 ceph_assert(session);
2478 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2479 // Session's 'completed_requests' was dirtied, mark it to be
2480 // potentially flushed at segment expiry.
2481 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2482
2483 if (session->get_num_trim_requests_warnings() > 0 &&
2484 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2485 session->reset_num_trim_requests_warnings();
2486 } else {
2487 if (session->get_num_completed_requests() >=
2488 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2489 session->inc_num_trim_requests_warnings();
2490 CachedStackStringStream css;
2491 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2492 << req->get_oldest_client_tid() << "), "
2493 << session->get_num_completed_requests()
2494 << " completed requests recorded in session\n";
2495 mds->clog->warn() << css->strv();
2496 dout(20) << __func__ << " " << css->strv() << dendl;
2497 }
2498 }
2499 }
2500
2501 // register + dispatch
2502 MDRequestRef mdr = mdcache->request_start(req);
2503 if (!mdr.get())
2504 return;
2505
2506 if (session) {
2507 mdr->session = session;
2508 session->requests.push_back(&mdr->item_session_request);
2509 }
2510
2511 if (has_completed)
2512 mdr->has_completed = true;
2513
2514 // process embedded cap releases?
2515 // (only if NOT replay!)
2516 if (!req->releases.empty() && req->is_a_client() && !req->is_replay()) {
2517 client_t client = req->get_source().num();
2518 for (const auto &r : req->releases) {
2519 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2520 }
2521 req->releases.clear();
2522 }
2523
2524 dispatch_client_request(mdr);
2525 return;
2526 }
2527
2528 void Server::handle_osd_map()
2529 {
2530 /* Note that we check the OSDMAP_FULL flag directly rather than
2531 * using osdmap_full_flag(), because we want to know "is the flag set"
2532 * rather than "does the flag apply to us?" */
2533 mds->objecter->with_osdmap([this](const OSDMap& o) {
2534 auto pi = o.get_pg_pool(mds->get_metadata_pool());
2535 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2536 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2537 << o.get_epoch() << dendl;
2538 });
2539 }
2540
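// Dispatch (or re-dispatch after a retry) a registered client request. Killed
// batch heads hand off to a surviving follower, aborted requests are torn down,
// writes are rejected on a read-only cache, and when the metadata pool is full,
// space-consuming ops (setxattr, create, mksnap, ...) proceed only if the caller
// has MAY_FULL access, before switching on the op code.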
2541 void Server::dispatch_client_request(MDRequestRef& mdr)
2542 {
2543 // we shouldn't be waiting on anyone.
2544 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2545
2546 if (mdr->killed) {
2547 dout(10) << "request " << *mdr << " was killed" << dendl;
2548 // if the mdr is a "batch_op" and it has followers, pick a follower as
2549 // the new "head of the batch ops" and go on processing the new one.
2550 if (mdr->is_batch_head()) {
2551 int mask = mdr->client_request->head.args.getattr.mask;
2552 auto it = mdr->batch_op_map->find(mask);
2553 auto new_batch_head = it->second->find_new_head();
2554 if (!new_batch_head) {
2555 mdr->batch_op_map->erase(it);
2556 return;
2557 }
2558 mdr = std::move(new_batch_head);
2559 } else {
2560 return;
2561 }
2562 } else if (mdr->aborted) {
2563 mdr->aborted = false;
2564 mdcache->request_kill(mdr);
2565 return;
2566 }
2567
2568 const cref_t<MClientRequest> &req = mdr->client_request;
2569
2570 if (logger) logger->inc(l_mdss_dispatch_client_request);
2571
2572 dout(7) << "dispatch_client_request " << *req << dendl;
2573
2574 if (req->may_write() && mdcache->is_readonly()) {
2575 dout(10) << " read-only FS" << dendl;
2576 respond_to_request(mdr, -CEPHFS_EROFS);
2577 return;
2578 }
2579 if (mdr->has_more() && mdr->more()->peer_error) {
2580 dout(10) << " got error from peers" << dendl;
2581 respond_to_request(mdr, mdr->more()->peer_error);
2582 return;
2583 }
2584
2585 if (is_full) {
2586 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2587 if (!cur) {
2588 // the request is already responded to
2589 return;
2590 }
2591 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2592 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2594 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2595 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2596 req->get_op() == CEPH_MDS_OP_CREATE ||
2597 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2598 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2599 ((req->get_op() == CEPH_MDS_OP_LINK ||
2600 req->get_op() == CEPH_MDS_OP_RENAME) &&
2601 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2602 ) {
2603
2604 if (check_access(mdr, cur, MAY_FULL)) {
2605 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2606 } else {
2607 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2608 respond_to_request(mdr, -CEPHFS_ENOSPC);
2609 return;
2610 }
2611 } else {
2612 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2613 }
2614 }
2615
2616 switch (req->get_op()) {
2617 case CEPH_MDS_OP_LOOKUPHASH:
2618 case CEPH_MDS_OP_LOOKUPINO:
2619 handle_client_lookup_ino(mdr, false, false);
2620 break;
2621 case CEPH_MDS_OP_LOOKUPPARENT:
2622 handle_client_lookup_ino(mdr, true, false);
2623 break;
2624 case CEPH_MDS_OP_LOOKUPNAME:
2625 handle_client_lookup_ino(mdr, false, true);
2626 break;
2627
2628 // inodes ops.
2629 case CEPH_MDS_OP_LOOKUP:
2630 handle_client_getattr(mdr, true);
2631 break;
2632
2633 case CEPH_MDS_OP_LOOKUPSNAP:
2634 // lookupsnap does not reference a CDentry; treat it as a getattr
2635 case CEPH_MDS_OP_GETATTR:
2636 handle_client_getattr(mdr, false);
2637 break;
2638 case CEPH_MDS_OP_GETVXATTR:
2639 handle_client_getvxattr(mdr);
2640 break;
2641
2642 case CEPH_MDS_OP_SETATTR:
2643 handle_client_setattr(mdr);
2644 break;
2645 case CEPH_MDS_OP_SETLAYOUT:
2646 handle_client_setlayout(mdr);
2647 break;
2648 case CEPH_MDS_OP_SETDIRLAYOUT:
2649 handle_client_setdirlayout(mdr);
2650 break;
2651 case CEPH_MDS_OP_SETXATTR:
2652 handle_client_setxattr(mdr);
2653 break;
2654 case CEPH_MDS_OP_RMXATTR:
2655 handle_client_removexattr(mdr);
2656 break;
2657
2658 case CEPH_MDS_OP_READDIR:
2659 handle_client_readdir(mdr);
2660 break;
2661
2662 case CEPH_MDS_OP_SETFILELOCK:
2663 handle_client_file_setlock(mdr);
2664 break;
2665
2666 case CEPH_MDS_OP_GETFILELOCK:
2667 handle_client_file_readlock(mdr);
2668 break;
2669
2670 // funky.
2671 case CEPH_MDS_OP_CREATE:
2672 if (mdr->has_completed)
2673 handle_client_open(mdr); // already created.. just open
2674 else
2675 handle_client_openc(mdr);
2676 break;
2677
2678 case CEPH_MDS_OP_OPEN:
2679 handle_client_open(mdr);
2680 break;
2681
2682 // namespace.
2683 // no prior locks.
2684 case CEPH_MDS_OP_MKNOD:
2685 handle_client_mknod(mdr);
2686 break;
2687 case CEPH_MDS_OP_LINK:
2688 handle_client_link(mdr);
2689 break;
2690 case CEPH_MDS_OP_UNLINK:
2691 case CEPH_MDS_OP_RMDIR:
2692 handle_client_unlink(mdr);
2693 break;
2694 case CEPH_MDS_OP_RENAME:
2695 handle_client_rename(mdr);
2696 break;
2697 case CEPH_MDS_OP_MKDIR:
2698 handle_client_mkdir(mdr);
2699 break;
2700 case CEPH_MDS_OP_SYMLINK:
2701 handle_client_symlink(mdr);
2702 break;
2703
2704
2705 // snaps
2706 case CEPH_MDS_OP_LSSNAP:
2707 handle_client_lssnap(mdr);
2708 break;
2709 case CEPH_MDS_OP_MKSNAP:
2710 handle_client_mksnap(mdr);
2711 break;
2712 case CEPH_MDS_OP_RMSNAP:
2713 handle_client_rmsnap(mdr);
2714 break;
2715 case CEPH_MDS_OP_RENAMESNAP:
2716 handle_client_renamesnap(mdr);
2717 break;
2718 case CEPH_MDS_OP_READDIR_SNAPDIFF:
2719 handle_client_readdir_snapdiff(mdr);
2720 break;
2721
2722 default:
2723 dout(1) << " unknown client op " << req->get_op() << dendl;
2724 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2725 }
2726 }
2727
2728
2729 // ---------------------------------------
2730 // PEER REQUESTS
2731
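// A peer request is an MMDSPeerRequest sent by the leader MDS of a multi-rank
// operation (rename, link, rmdir, ...). Replies are routed to
// handle_peer_request_reply(); OP_RENAMENOTIFY is acked straight away; otherwise
// the matching peer MDRequest is found (or created), stale attempts are dropped
// or closed out, and the request is dispatched via dispatch_peer_request().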
2732 void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
2733 {
2734 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2735 mds_rank_t from = mds_rank_t(m->get_source().num());
2736
2737 if (logger) logger->inc(l_mdss_handle_peer_request);
2738
2739 // reply?
2740 if (m->is_reply())
2741 return handle_peer_request_reply(m);
2742
2743 // The purpose of rename notify is to enforce causal message ordering, making sure
2744 // bystanders have received all messages from the rename srcdn's auth MDS.
2745 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2746 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
2747 mds->send_message(reply, m->get_connection());
2748 return;
2749 }
2750
2751 CDentry *straydn = NULL;
2752 if (m->straybl.length() > 0) {
2753 mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
2754 ceph_assert(straydn);
2755 m->straybl.clear();
2756 }
2757
2758 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2759 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2760 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2761 return;
2762 }
2763
2764 // am i a new peer?
2765 MDRequestRef mdr;
2766 if (mdcache->have_request(m->get_reqid())) {
2767 // existing?
2768 mdr = mdcache->request_get(m->get_reqid());
2769
2770 // is my request newer?
2771 if (mdr->attempt > m->get_attempt()) {
2772 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2773 << ", dropping " << *m << dendl;
2774 return;
2775 }
2776
2777 if (mdr->attempt < m->get_attempt()) {
2778 // mine is old, close it out
2779 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2780 << ", closing out" << dendl;
2781 mdcache->request_finish(mdr);
2782 mdr.reset();
2783 } else if (mdr->peer_to_mds != from) {
2784 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2785 return;
2786 }
2787
2788 // may get these while mdr->peer_request is non-null
2789 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2790 mds->locker->drop_locks(mdr.get());
2791 return;
2792 }
2793 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2794 if (m->is_abort()) {
2795 mdr->aborted = true;
2796 if (mdr->peer_request) {
2797 // only abort on-going xlock, wrlock and auth pin
2798 ceph_assert(!mdr->peer_did_prepare());
2799 } else {
2800 mdcache->request_finish(mdr);
2801 }
2802 } else {
2803 if (m->inode_export.length() > 0)
2804 mdr->more()->inode_import = m->inode_export;
2805 // finish off request.
2806 mdcache->request_finish(mdr);
2807 }
2808 return;
2809 }
2810 }
2811 if (!mdr.get()) {
2812 // new?
2813 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2814 dout(10) << "missing peer request for " << m->get_reqid()
2815 << " OP_FINISH, must have lost race with a forward" << dendl;
2816 return;
2817 }
2818 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2819 mdr->set_op_stamp(m->op_stamp);
2820 }
2821 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2822
2823 if (straydn) {
2824 mdr->pin(straydn);
2825 mdr->straydn = straydn;
2826 }
2827
2828 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2829 mdr->locks.empty()) {
2830 dout(3) << "not active yet, waiting" << dendl;
2831 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2832 return;
2833 }
2834
2835 mdr->reset_peer_request(m);
2836
2837 dispatch_peer_request(mdr);
2838 }
2839
2840 void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2841 {
2842 mds_rank_t from = mds_rank_t(m->get_source().num());
2843
2844 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2845 metareqid_t r = m->get_reqid();
2846 if (!mdcache->have_uncommitted_leader(r, from)) {
2847 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2848 << from << " reqid " << r << dendl;
2849 return;
2850 }
2851 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2852 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2853 return;
2854 }
2855
2856 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
2857 metareqid_t r = m->get_reqid();
2858 mdcache->committed_leader_peer(r, from);
2859 return;
2860 }
2861
2862 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2863 if (m->get_attempt() != mdr->attempt) {
2864 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
2865 << m->get_attempt() << dendl;
2866 return;
2867 }
2868
2869 switch (m->get_op()) {
2870 case MMDSPeerRequest::OP_XLOCKACK:
2871 {
2872 // identify lock, leader request
2873 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2874 m->get_object_info());
2875 mdr->more()->peers.insert(from);
2876 lock->decode_locked_state(m->get_lock_data());
2877 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2878 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2879 mdr->finish_locking(lock);
2880 lock->get_xlock(mdr, mdr->get_client());
2881
2882 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2883 mdr->more()->waiting_on_peer.erase(from);
2884 ceph_assert(mdr->more()->waiting_on_peer.empty());
2885 mdcache->dispatch_request(mdr);
2886 }
2887 break;
2888
2889 case MMDSPeerRequest::OP_WRLOCKACK:
2890 {
2891 // identify lock, leader request
2892 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2893 m->get_object_info());
2894 mdr->more()->peers.insert(from);
2895 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2896 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2897 ceph_assert(it->is_remote_wrlock());
2898 ceph_assert(it->wrlock_target == from);
2899
2900 mdr->finish_locking(lock);
2901
2902 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2903 mdr->more()->waiting_on_peer.erase(from);
2904 ceph_assert(mdr->more()->waiting_on_peer.empty());
2905 mdcache->dispatch_request(mdr);
2906 }
2907 break;
2908
2909 case MMDSPeerRequest::OP_AUTHPINACK:
2910 handle_peer_auth_pin_ack(mdr, m);
2911 break;
2912
2913 case MMDSPeerRequest::OP_LINKPREPACK:
2914 handle_peer_link_prep_ack(mdr, m);
2915 break;
2916
2917 case MMDSPeerRequest::OP_RMDIRPREPACK:
2918 handle_peer_rmdir_prep_ack(mdr, m);
2919 break;
2920
2921 case MMDSPeerRequest::OP_RENAMEPREPACK:
2922 handle_peer_rename_prep_ack(mdr, m);
2923 break;
2924
2925 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2926 handle_peer_rename_notify_ack(mdr, m);
2927 break;
2928
2929 default:
2930 ceph_abort_msg("unknown op " + to_string(m->get_op()) + " requested");
2931 }
2932 }
2933
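// Carry out the leader's instruction on this rank: XLOCK/WRLOCK acquire the
// named lock (with auth pinning) and ack back with its state, UNXLOCK/UNWRLOCK
// release it, and AUTHPIN plus the *PREP ops are handed to their dedicated
// handlers.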
2934 void Server::dispatch_peer_request(MDRequestRef& mdr)
2935 {
2936 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2937
2938 if (mdr->aborted) {
2939 dout(7) << " abort flag set, finishing" << dendl;
2940 mdcache->request_finish(mdr);
2941 return;
2942 }
2943
2944 if (logger) logger->inc(l_mdss_dispatch_peer_request);
2945
2946 int op = mdr->peer_request->get_op();
2947 switch (op) {
2948 case MMDSPeerRequest::OP_XLOCK:
2949 case MMDSPeerRequest::OP_WRLOCK:
2950 {
2951 // identify object
2952 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2953 mdr->peer_request->get_object_info());
2954
2955 if (!lock) {
2956 dout(10) << "don't have object, dropping" << dendl;
2957 ceph_abort_msg("don't have object"); // can this happen if we auth pinned properly?
2958 }
2959 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2960 dout(10) << "not auth for remote xlock attempt, dropping on "
2961 << *lock << " on " << *lock->get_parent() << dendl;
2962 } else {
2963 // use acquire_locks so that we get auth_pinning.
2964 MutationImpl::LockOpVec lov;
2965 for (const auto& p : mdr->locks) {
2966 if (p.is_xlock())
2967 lov.add_xlock(p.lock);
2968 else if (p.is_wrlock())
2969 lov.add_wrlock(p.lock);
2970 }
2971
2972 int replycode = 0;
2973 switch (op) {
2974 case MMDSPeerRequest::OP_XLOCK:
2975 lov.add_xlock(lock);
2976 replycode = MMDSPeerRequest::OP_XLOCKACK;
2977 break;
2978 case MMDSPeerRequest::OP_WRLOCK:
2979 lov.add_wrlock(lock);
2980 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2981 break;
2982 }
2983
2984 if (!mds->locker->acquire_locks(mdr, lov))
2985 return;
2986
2987 // ack
2988 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2989 r->set_lock_type(lock->get_type());
2990 lock->get_parent()->set_object_info(r->get_object_info());
2991 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
2992 lock->encode_locked_state(r->get_lock_data());
2993 mds->send_message(r, mdr->peer_request->get_connection());
2994 }
2995
2996 // done.
2997 mdr->reset_peer_request();
2998 }
2999 break;
3000
3001 case MMDSPeerRequest::OP_UNXLOCK:
3002 case MMDSPeerRequest::OP_UNWRLOCK:
3003 {
3004 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
3005 mdr->peer_request->get_object_info());
3006 ceph_assert(lock);
3007 auto it = mdr->locks.find(lock);
3008 ceph_assert(it != mdr->locks.end());
3009 bool need_issue = false;
3010 switch (op) {
3011 case MMDSPeerRequest::OP_UNXLOCK:
3012 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
3013 break;
3014 case MMDSPeerRequest::OP_UNWRLOCK:
3015 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
3016 break;
3017 }
3018 if (need_issue)
3019 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
3020
3021 // done. no ack necessary.
3022 mdr->reset_peer_request();
3023 }
3024 break;
3025
3026 case MMDSPeerRequest::OP_AUTHPIN:
3027 handle_peer_auth_pin(mdr);
3028 break;
3029
3030 case MMDSPeerRequest::OP_LINKPREP:
3031 case MMDSPeerRequest::OP_UNLINKPREP:
3032 handle_peer_link_prep(mdr);
3033 break;
3034
3035 case MMDSPeerRequest::OP_RMDIRPREP:
3036 handle_peer_rmdir_prep(mdr);
3037 break;
3038
3039 case MMDSPeerRequest::OP_RENAMEPREP:
3040 handle_peer_rename_prep(mdr);
3041 break;
3042
3043 default:
3044 ceph_abort_msg("unknown op "+ to_string(op)+ " received");
3045 }
3046 }
3047
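// Auth-pin, on behalf of the leader, every object listed in the peer request.
// The attempt fails (and the ack says so) if the cache is read-only, an object
// is missing or not auth here, or a nonblocking request would have to wait;
// blocking requests instead wait for the object to become pinnable. On success
// the ack lists the objects actually pinned and, if requested, the inode that
// was frozen for a rename.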
3048 void Server::handle_peer_auth_pin(MDRequestRef& mdr)
3049 {
3050 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
3051
3052 // build list of objects
3053 list<MDSCacheObject*> objects;
3054 CInode *auth_pin_freeze = NULL;
3055 bool nonblocking = mdr->peer_request->is_nonblocking();
3056 bool fail = false, wouldblock = false, readonly = false;
3057 ref_t<MMDSPeerRequest> reply;
3058
3059 if (mdcache->is_readonly()) {
3060 dout(10) << " read-only FS" << dendl;
3061 readonly = true;
3062 fail = true;
3063 }
3064
3065 if (!fail) {
3066 for (const auto &oi : mdr->peer_request->get_authpins()) {
3067 MDSCacheObject *object = mdcache->get_object(oi);
3068 if (!object) {
3069 dout(10) << " don't have " << oi << dendl;
3070 fail = true;
3071 break;
3072 }
3073
3074 objects.push_back(object);
3075 if (oi == mdr->peer_request->get_authpin_freeze())
3076 auth_pin_freeze = static_cast<CInode*>(object);
3077 }
3078 }
3079
3080 // can we auth pin them?
3081 if (!fail) {
3082 for (const auto& obj : objects) {
3083 if (!obj->is_auth()) {
3084 dout(10) << " not auth for " << *obj << dendl;
3085 fail = true;
3086 break;
3087 }
3088 if (mdr->is_auth_pinned(obj))
3089 continue;
3090 if (!mdr->can_auth_pin(obj)) {
3091 if (nonblocking) {
3092 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
3093 fail = true;
3094 wouldblock = true;
3095 break;
3096 }
3097 // wait
3098 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3099 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3100 mdr->drop_local_auth_pins();
3101
3102 mds->locker->notify_freeze_waiter(obj);
3103 goto blocked;
3104 }
3105 }
3106 }
3107
3108 if (!fail) {
3109 /* the existing frozen auth pin is on the wrong inode; release it */
3110 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3111 mdr->more()->rename_inode != auth_pin_freeze)
3112 mdr->unfreeze_auth_pin(true);
3113
3114 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
3115 * on the source inode to complete. This happens after all locks for the rename
3116 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3117 * parent objects first. So there is an ABBA deadlock if someone auth pins the source
3118 * inode after locks are acquired and before Server::handle_peer_rename_prep() is called.
3119 * The solution is to freeze the inode and prevent other MDRequests from getting new
3120 * auth pins.
3121 */
3122 if (auth_pin_freeze) {
3123 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3124 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3125 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3126 mds->mdlog->flush();
3127 goto blocked;
3128 }
3129 }
3130 }
3131
3132 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3133
3134 if (fail) {
3135 mdr->drop_local_auth_pins(); // just in case
3136 if (readonly)
3137 reply->mark_error_rofs();
3138 if (wouldblock)
3139 reply->mark_error_wouldblock();
3140 } else {
3141 // auth pin!
3142 for (const auto& obj : objects) {
3143 dout(10) << "auth_pinning " << *obj << dendl;
3144 mdr->auth_pin(obj);
3145 }
3146 // return list of my auth_pins (if any)
3147 for (const auto &p : mdr->object_states) {
3148 if (!p.second.auth_pinned)
3149 continue;
3150 MDSCacheObjectInfo info;
3151 p.first->set_object_info(info);
3152 reply->get_authpins().push_back(info);
3153 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3154 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3155 }
3156 }
3157
3158 mds->send_message_mds(reply, mdr->peer_to_mds);
3159
3160 // clean up this request
3161 mdr->reset_peer_request();
3162 return;
3163
3164 blocked:
3165 if (mdr->peer_request->should_notify_blocking()) {
3166 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3167 reply->mark_req_blocked();
3168 mds->send_message_mds(reply, mdr->peer_to_mds);
3169 mdr->peer_request->clear_notify_blocking();
3170 }
3171 return;
3172 }
3173
3174 void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
3175 {
3176 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3177 mds_rank_t from = mds_rank_t(ack->get_source().num());
3178
3179 if (ack->is_req_blocked()) {
3180 mdr->disable_lock_cache();
3181 // peer auth pin is blocked, drop locks to avoid deadlock
3182 mds->locker->drop_locks(mdr.get(), nullptr);
3183 return;
3184 }
3185
3186 // added auth pins?
3187 set<MDSCacheObject*> pinned;
3188 for (const auto &oi : ack->get_authpins()) {
3189 MDSCacheObject *object = mdcache->get_object(oi);
3190 ceph_assert(object); // we pinned it
3191 dout(10) << " remote has pinned " << *object << dendl;
3192 mdr->set_remote_auth_pinned(object, from);
3193 if (oi == ack->get_authpin_freeze())
3194 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3195 pinned.insert(object);
3196 }
3197
3198 // removed frozen auth pin ?
3199 if (mdr->more()->is_remote_frozen_authpin &&
3200 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3201 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3202 ceph_assert(stat_p);
3203 if (stat_p->remote_auth_pinned == from) {
3204 mdr->more()->is_remote_frozen_authpin = false;
3205 }
3206 }
3207
3208 // removed auth pins?
3209 for (auto& p : mdr->object_states) {
3210 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3211 continue;
3212 MDSCacheObject* object = p.first;
3213 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3214 dout(10) << " remote has unpinned " << *object << dendl;
3215 mdr->_clear_remote_auth_pinned(p.second);
3216 }
3217 }
3218
3219 // note peer
3220 mdr->more()->peers.insert(from);
3221
3222 // clear from waiting list
3223 auto ret = mdr->more()->waiting_on_peer.erase(from);
3224 ceph_assert(ret);
3225
3226 if (ack->is_error_rofs()) {
3227 mdr->more()->peer_error = -CEPHFS_EROFS;
3228 } else if (ack->is_error_wouldblock()) {
3229 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
3230 }
3231
3232 // go again?
3233 if (mdr->more()->waiting_on_peer.empty())
3234 mdcache->dispatch_request(mdr);
3235 else
3236 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
3237 }
3238
3239
3240 // ---------------------------------------
3241 // HELPERS
3242
3243
3244 /**
3245 * check whether we are permitted to complete a request
3246 *
3247 * Check whether we have permission to perform the operation specified
3248 * by mask on the given inode, based on the capability in the mdr's
3249 * session.
3250 */
3251 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3252 {
3253 if (mdr->session) {
3254 int r = mdr->session->check_access(
3255 in, mask,
3256 mdr->client_request->get_caller_uid(),
3257 mdr->client_request->get_caller_gid(),
3258 &mdr->client_request->get_caller_gid_list(),
3259 mdr->client_request->head.args.setattr.uid,
3260 mdr->client_request->head.args.setattr.gid);
3261 if (r < 0) {
3262 respond_to_request(mdr, r);
3263 return false;
3264 }
3265 }
3266 return true;
3267 }
3268
3269 /**
3270 * check whether fragment has reached maximum size
3271 *
3272 */
3273 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
3274 {
3275 const auto size = dir->get_frag_size();
3276 const auto max = bal_fragment_size_max;
3277 if (size >= max) {
3278 dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
3279 respond_to_request(mdr, -CEPHFS_ENOSPC);
3280 return false;
3281 } else {
3282 dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
3283 }
3284
3285 return true;
3286 }
3287
3288 /**
3289 * check whether entries in a dir reached maximum size
3290 *
3291 */
3292 bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
3293 {
3294 const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
3295 in->inode->get_projected_inode()->dirstat.nsubdirs;
3296 if (dir_max_entries && size >= dir_max_entries) {
3297 dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
3298 respond_to_request(mdr, -ENOSPC);
3299 return false;
3300 }
3301 return true;
3302 }
3303
3304
3305 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3306 {
3307 string straydname;
3308 in->name_stray_dentry(straydname);
3309
3310 CDentry *straydn = mdr->straydn;
3311 if (straydn) {
3312 ceph_assert(straydn->get_name() == straydname);
3313 return straydn;
3314 }
3315 CDir *straydir = mdcache->get_stray_dir(in);
3316
3317 if (!mdr->client_request->is_replay() &&
3318 !check_fragment_space(mdr, straydir))
3319 return nullptr;
3320
3321 straydn = straydir->lookup(straydname);
3322 if (!straydn) {
3323 if (straydir->is_frozen_dir()) {
3324 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3325 mds->locker->drop_locks(mdr.get());
3326 mdr->drop_local_auth_pins();
3327 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3328 return nullptr;
3329 }
3330 straydn = straydir->add_null_dentry(straydname);
3331 straydn->mark_new();
3332 } else {
3333 ceph_assert(straydn->get_projected_linkage()->is_null());
3334 }
3335
3336 straydn->state_set(CDentry::STATE_STRAY);
3337 mdr->straydn = straydn;
3338 mdr->pin(straydn);
3339
3340 return straydn;
3341 }
3342
3343 /** prepare_new_inode
3344 *
3345 * create a new inode. set c/m/atime. hit dir pop.
3346 */
3347 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3348 const file_layout_t *layout)
3349 {
3350 CInode *in = new CInode(mdcache);
3351 auto _inode = in->_get_inode();
3352
3353   // Server::prepare_force_open_sessions() can re-open a session that is in the
3354   // closing state. In that corner case, the session's prealloc_inos are being freed.
3355   // To simplify the code, we disallow using/refilling the session's prealloc_inos
3356   // while the session is still opening.
3357 bool allow_prealloc_inos = mdr->session->is_open();
3358
3359 inodeno_t _useino = useino;
3360
3361 // assign ino
3362 do {
3363 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(_useino))) {
3364 if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
3365 _inode->ino = 0;
3366 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3367 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3368 << " but has been taken, will try again!" << dendl;
3369 } else {
3370 mds->sessionmap.mark_projected(mdr->session);
3371 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3372 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3373 << dendl;
3374 }
3375 } else {
3376 mdr->alloc_ino =
3377 _inode->ino = mds->inotable->project_alloc_id(_useino);
3378 if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
3379 mds->inotable->apply_alloc_id(_inode->ino);
3380 _inode->ino = 0;
3381 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino
3382 << " but has been taken, will try again!" << dendl;
3383 } else {
3384 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3385 }
3386 }
3387 _useino = 0;
3388 } while (!_inode->ino);
3389
3390 if (useino && useino != _inode->ino) {
3391 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
3392 mds->clog->error() << mdr->client_request->get_source()
3393 << " specified ino " << useino
3394 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
3395 //ceph_abort(); // just for now.
3396 }
3397
3398 if (allow_prealloc_inos &&
3399 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3400 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3401 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3402 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3403 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3404 mds->sessionmap.mark_projected(mdr->session);
3405 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3406 }
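// Worked example (illustrative only; the value 1000 is hypothetical here): if
// mds_client_prealloc_inos were 1000, the branch above refills only when the
// session's projected pool has dropped below 500, and then tops it back up to
// the full 1000, i.e. need = 1000 - current.
#if 0
#include <cassert>
#include <cstdint>

uint64_t prealloc_need(uint64_t target, uint64_t current) {
  // refill only when below half the target, and always top back up to the target
  if (current >= target / 2)
    return 0;
  return target - current;
}

int main() {
  assert(prealloc_need(1000, 600) == 0);    // above half: no refill
  assert(prealloc_need(1000, 499) == 501);  // below half: top back up to 1000
}
#endif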
3407
3408 _inode->version = 1;
3409 _inode->xattr_version = 1;
3410 _inode->nlink = 1; // FIXME
3411
3412 _inode->mode = mode;
3413
3414 // FIPS zeroization audit 20191117: this memset is not security related.
3415 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3416 if (_inode->is_dir()) {
3417 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3418 } else if (layout) {
3419 _inode->layout = *layout;
3420 } else {
3421 _inode->layout = mdcache->default_file_layout;
3422 }
3423
3424 _inode->truncate_size = -1ull; // not truncated, yet!
3425 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3426
3427 CInode *diri = dir->get_inode();
3428 auto pip = diri->get_projected_inode();
3429
3430 dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;
3431
3432 if (pip->mode & S_ISGID) {
3433     dout(10) << " dir is setgid" << dendl;
3434     _inode->gid = pip->gid;
3435     if (S_ISDIR(mode)) {
3436       dout(10) << " new dir also setgid" << dendl;
3437 _inode->mode |= S_ISGID;
3438 }
3439 } else {
3440 _inode->gid = mdr->client_request->get_owner_gid();
3441 ceph_assert(_inode->gid != (unsigned)-1);
3442 }
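// Illustrative sketch (not part of Server.cc): the inheritance rule applied above
// is the usual POSIX setgid-directory behaviour -- a new inode created under a
// setgid directory takes the directory's gid, and a new subdirectory also keeps
// the setgid bit. A standalone version with hypothetical types, using only
// <sys/stat.h>:
#if 0
#include <sys/stat.h>

struct new_inode_ids { unsigned mode; unsigned gid; };

new_inode_ids inherit_gid(unsigned parent_mode, unsigned parent_gid,
                          unsigned req_mode, unsigned req_gid) {
  new_inode_ids out{req_mode, req_gid};
  if (parent_mode & S_ISGID) {
    out.gid = parent_gid;              // child takes the directory's group
    if (S_ISDIR(req_mode))
      out.mode |= S_ISGID;             // new directories stay setgid
  }
  return out;
}
#endif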
3443
3444 _inode->uid = mdr->client_request->get_owner_uid();
3445 ceph_assert(_inode->uid != (unsigned)-1);
3446
3447 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
3448 mdr->get_op_stamp();
3449
3450 _inode->change_attr = 0;
3451
3452 const cref_t<MClientRequest> &req = mdr->client_request;
3453
3454 dout(10) << "copying fscrypt_auth len " << req->fscrypt_auth.size() << dendl;
3455 _inode->fscrypt_auth = req->fscrypt_auth;
3456 _inode->fscrypt_file = req->fscrypt_file;
3457
3458 if (req->get_data().length()) {
3459 auto p = req->get_data().cbegin();
3460
3461 // xattrs on new inode?
3462 auto _xattrs = CInode::allocate_xattr_map();
3463 decode_noshare(*_xattrs, p);
3464 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3465 in->reset_xattrs(std::move(_xattrs));
3466 }
3467
3468 if (!mds->mdsmap->get_inline_data_enabled() ||
3469 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3470 _inode->inline_data.version = CEPH_INLINE_NONE;
3471
3472 mdcache->add_inode(in); // add
3473 dout(10) << "prepare_new_inode " << *in << dendl;
3474 return in;
3475 }
3476
3477 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3478 {
3479 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3480 << " inotablev " << mds->inotable->get_projected_version()
3481 << dendl;
3482 blob->set_ino_alloc(mdr->alloc_ino,
3483 mdr->used_prealloc_ino,
3484 mdr->prealloc_inos,
3485 mdr->client_request->get_source(),
3486 mds->sessionmap.get_projected(),
3487 mds->inotable->get_projected_version());
3488 }
3489
3490 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3491 {
3492 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3493 << " / " << mdr->prealloc_inos
3494 << " / " << mdr->used_prealloc_ino << dendl;
3495
3496 if (mdr->alloc_ino) {
3497 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3498 }
3499 if (mdr->prealloc_inos.size()) {
3500 ceph_assert(session);
3501 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3502 session->free_prealloc_inos.insert(mdr->prealloc_inos);
3503 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3504 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3505 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3506 }
3507 if (mdr->used_prealloc_ino) {
3508 ceph_assert(session);
3509 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
3510 mds->sessionmap.mark_dirty(session);
3511 }
3512 }
3513
3514 struct C_MDS_TryOpenInode : public ServerContext {
3515 MDRequestRef mdr;
3516 inodeno_t ino;
3517 C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
3518 ServerContext(s), mdr(r), ino(i) {}
3519 void finish(int r) override {
3520 server->_try_open_ino(mdr, r, ino);
3521 }
3522 };
3523
3524 void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
3525 {
3526 dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3527
3528 // `r` is a rank if >=0, else an error code
3529 if (r >= 0) {
3530 mds_rank_t dest_rank(r);
3531 if (dest_rank == mds->get_nodeid())
3532 dispatch_client_request(mdr);
3533 else
3534 mdcache->request_forward(mdr, dest_rank);
3535 return;
3536 }
3537
3538 // give up
3539 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
3540 r = -CEPHFS_ESTALE;
3541 respond_to_request(mdr, r);
3542 }
3543
3544 class C_MDS_TryFindInode : public ServerContext {
3545 MDRequestRef mdr;
3546 MDCache *mdcache;
3547 inodeno_t ino;
3548 public:
3549 C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
3550 ServerContext(s), mdr(r), mdcache(m), ino(i) {}
3551 void finish(int r) override {
3552 if (r == -CEPHFS_ESTALE) { // :( find_ino_peers failed
3553       /*
3554        * There is one case where the MDS crashes before the
3555        * openfiletable journal could be flushed, and then the
3556        * replacing MDS may not load some already opened
3557        * CInodes into the MDCache. If clients retry some
3558        * requests after reconnecting, the MDS will return
3559        * -ESTALE after failing to find the ino in all active
3560        * peers.
3561        *
3562        * As a workaround, users can run `ls -R ${mountpoint}`
3563        * to list all the sub-files and sub-directories under
3564        * the mountpoint.
3565        *
3566        * Here we instead try to open the ino and retry the request.
3567        */
3568 CInode *in = mdcache->get_inode(ino);
3569 if (in && in->state_test(CInode::STATE_PURGING))
3570 server->respond_to_request(mdr, r);
3571 else
3572 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
3573 } else {
3574 server->dispatch_client_request(mdr);
3575 }
3576 }
3577 };
3578
3579 /* If this returns null, the request has been handled
3580 * as appropriate: forwarded on, or the client's been replied to */
3581 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3582 bool want_auth,
3583 bool no_want_auth)
3584 {
3585 const filepath& refpath = mdr->get_filepath();
3586 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3587
3588 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3589 return mdr->in[0];
3590
3591 // traverse
3592 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3593 int flags = 0;
3594 if (refpath.is_last_snap()) {
3595 if (!no_want_auth)
3596 want_auth = true;
3597 } else {
3598 if (!no_want_auth && forward_all_requests_to_auth)
3599 want_auth = true;
3600 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3601 }
3602 if (want_auth)
3603 flags |= MDS_TRAVERSE_WANT_AUTH;
3604 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3605 if (r > 0)
3606 return nullptr; // delayed
3607 if (r < 0) { // error
3608 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
3609 if (mdr->client_request &&
3610 mdr->client_request->get_dentry_wanted())
3611 mdr->tracedn = mdr->dn[0].back();
3612 respond_to_request(mdr, r);
3613 } else if (r == -CEPHFS_ESTALE) {
3614 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3615 inodeno_t ino = refpath.get_ino();
3616 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3617 } else {
3618 dout(10) << "FAIL on error " << r << dendl;
3619 respond_to_request(mdr, r);
3620 }
3621 return nullptr;
3622 }
3623 CInode *ref = mdr->in[0];
3624 dout(10) << "ref is " << *ref << dendl;
3625
3626 if (want_auth) {
3627 // auth_pin?
3628 // do NOT proceed if freezing, as cap release may defer in that case, and
3629 // we could deadlock when we try to lock @ref.
3630 // if we're already auth_pinned, continue; the release has already been processed.
3631 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3632 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3633 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3634 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3635 if (mdr->is_any_remote_auth_pin())
3636 mds->locker->notify_freeze_waiter(ref);
3637 return 0;
3638 }
3639 mdr->auth_pin(ref);
3640 }
3641
3642 // set and pin ref
3643 mdr->pin(ref);
3644 return ref;
3645 }
3646
3647
3648 /** rdlock_path_xlock_dentry
3649  * traverse path to the directory that could/would contain dentry.
3650  * make sure i am auth for that dentry (or for the target inode if it exists and authexist),
3651  * forward as necessary. create null dentry in place (or use existing if okexist).
3652  * get rdlocks on traversed dentries, xlock on new dentry.
3653  *
3654  * set authexist true if the caller requires the target inode to be auth when it exists.
3655  * with authexist, the tail dentry is no longer guaranteed to be auth, because it is
3656  * impossible to ensure the tail dentry and the target inode are both auth on one mds.
3657  * nor is the tail dentry xlocked if authexist is set and the target inode exists.
3658  */
3659 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3660 bool create, bool okexist, bool authexist,
3661 bool want_layout)
3662 {
3663 const filepath& refpath = mdr->get_filepath();
3664 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3665
3666 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3667 return mdr->dn[0].back();
3668
3669 // figure parent dir vs dname
3670 if (refpath.depth() == 0) {
3671 dout(7) << "invalid path (zero length)" << dendl;
3672 respond_to_request(mdr, -CEPHFS_EINVAL);
3673 return nullptr;
3674 }
3675
3676 if (refpath.is_last_snap()) {
3677 respond_to_request(mdr, -CEPHFS_EROFS);
3678 return nullptr;
3679 }
3680
3681 if (refpath.is_last_dot_or_dotdot()) {
3682 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3683 if (create)
3684 respond_to_request(mdr, -CEPHFS_EEXIST);
3685 else
3686 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
3687 return nullptr;
3688 }
3689
3690 // traverse to parent dir
3691 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3692 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3693 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3694 MDS_TRAVERSE_WANT_AUTH;
3695 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3696 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3697 if (create)
3698 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3699 if (authexist)
3700 flags |= MDS_TRAVERSE_WANT_INODE;
3701 if (want_layout)
3702 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3703 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3704 if (r > 0)
3705 return nullptr; // delayed
3706 if (r < 0) {
3707 if (r == -CEPHFS_ESTALE) {
3708 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3709 inodeno_t ino = refpath.get_ino();
3710 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3711 return nullptr;
3712 }
3713 respond_to_request(mdr, r);
3714 return nullptr;
3715 }
3716
3717 CDentry *dn = mdr->dn[0].back();
3718 CDir *dir = dn->get_dir();
3719 CInode *diri = dir->get_inode();
3720
3721 if (!mdr->reqid.name.is_mds()) {
3722 if (diri->is_system() && !diri->is_root() &&
3723 (!diri->is_lost_and_found() ||
3724 mdr->client_request->get_op() != CEPH_MDS_OP_UNLINK)) {
3725 respond_to_request(mdr, -CEPHFS_EROFS);
3726 return nullptr;
3727 }
3728 }
3729
3730 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3731 respond_to_request(mdr, -CEPHFS_ENOENT);
3732 return nullptr;
3733 }
3734
3735 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3736 if (dnl->is_null()) {
3737 if (!create && okexist) {
3738 respond_to_request(mdr, -CEPHFS_ENOENT);
3739 return nullptr;
3740 }
3741
3742 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3743 dn->first = std::max(dn->first, next_snap);
3744 } else {
3745 if (!okexist) {
3746 respond_to_request(mdr, -CEPHFS_EEXIST);
3747 return nullptr;
3748 }
3749 mdr->in[0] = dnl->get_inode();
3750 }
3751
3752 return dn;
3753 }
3754
3755 /** rdlock_two_paths_xlock_destdn
3756 * traverse two paths and lock the two paths in proper order.
3757 * The order of taking locks is:
3758 * 1. Lock directory inodes or dentries according to which trees they
3759 * are under. Lock objects under fs root before objects under mdsdir.
3760 * 2. Lock directory inodes or dentries according to their depth, in
3761 * ascending order.
3762 * 3. Lock directory inodes or dentries according to inode numbers or
3763 * dentries' parent inode numbers, in ascending order.
3764 * 4. Lock dentries in the same directory in order of their keys.
3765 * 5. Lock non-directory inodes according to inode numbers, in ascending
3766 * order.
3767 */
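// Illustrative sketch (not part of Server.cc): the ordering rules above define a
// single total order over lock targets; sorting every request's lock set by that
// order before acquiring is what keeps concurrent multi-object locking deadlock
// free. The struct and field names below are hypothetical simplifications.
#if 0
#include <algorithm>
#include <cstdint>
#include <string>
#include <tuple>
#include <vector>

struct LockTarget {
  bool under_mdsdir;     // rule 1: fs-root subtree sorts before the ~mdsdir subtree
  int depth;             // rule 2: shallower objects first
  uint64_t ino;          // rules 3 & 5: inode number (or parent ino for a dentry)
  std::string dname;     // rule 4: dentry key within one directory
  bool is_dir_object;    // directory inodes/dentries before plain inodes (rule 5)
};

bool lock_order_less(const LockTarget& a, const LockTarget& b) {
  // directory-tree objects (rules 1-4) come before non-directory inodes (rule 5)
  if (a.is_dir_object != b.is_dir_object)
    return a.is_dir_object;
  return std::tie(a.under_mdsdir, a.depth, a.ino, a.dname) <
         std::tie(b.under_mdsdir, b.depth, b.ino, b.dname);
}

void acquire_in_order(std::vector<LockTarget>& targets) {
  std::sort(targets.begin(), targets.end(), lock_order_less);
  // ... take each lock in this sorted order ...
}
#endif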
3768 std::pair<CDentry*, CDentry*>
3769 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3770 {
3771
3772 const filepath& refpath = mdr->get_filepath();
3773 const filepath& refpath2 = mdr->get_filepath2();
3774
3775 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3776
3777 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3778 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3779
3780 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3781 respond_to_request(mdr, -CEPHFS_EINVAL);
3782 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3783 }
3784
3785 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3786 respond_to_request(mdr, -CEPHFS_EROFS);
3787 return std::make_pair(nullptr, nullptr);
3788 }
3789
3790 // traverse to parent dir
3791 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3792 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3793 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3794 if (r != 0) {
3795 if (r == -CEPHFS_ESTALE) {
3796 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
3797 inodeno_t ino = refpath.get_ino();
3798 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3799 } else if (r < 0) {
3800 respond_to_request(mdr, r);
3801 }
3802 return std::make_pair(nullptr, nullptr);
3803 }
3804
3805 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3806 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3807 if (r != 0) {
3808 if (r == -CEPHFS_ESTALE) {
3809 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
3810 inodeno_t ino = refpath2.get_ino();
3811 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3812 } else if (r < 0) {
3813 respond_to_request(mdr, r);
3814 }
3815 return std::make_pair(nullptr, nullptr);
3816 }
3817
3818 CDentry *srcdn = mdr->dn[1].back();
3819 CDir *srcdir = srcdn->get_dir();
3820 CDentry *destdn = mdr->dn[0].back();
3821 CDir *destdir = destdn->get_dir();
3822
3823 if (!mdr->reqid.name.is_mds()) {
3824 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3825 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3826 respond_to_request(mdr, -CEPHFS_EROFS);
3827 return std::make_pair(nullptr, nullptr);
3828 }
3829 }
3830
3831 if (!destdir->get_inode()->is_base() &&
3832 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3833 respond_to_request(mdr, -CEPHFS_ENOENT);
3834 return std::make_pair(nullptr, nullptr);
3835 }
3836
3837 MutationImpl::LockOpVec lov;
3838 if (srcdir->get_inode() == destdir->get_inode()) {
3839 lov.add_wrlock(&destdir->inode->filelock);
3840 lov.add_wrlock(&destdir->inode->nestlock);
3841 if (xlock_srcdn && srcdir != destdir) {
3842 mds_rank_t srcdir_auth = srcdir->authority().first;
3843 if (srcdir_auth != mds->get_nodeid()) {
3844 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3845 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3846 }
3847 }
3848
3849 if (srcdn->get_name() > destdn->get_name())
3850 lov.add_xlock(&destdn->lock);
3851
3852 if (xlock_srcdn)
3853 lov.add_xlock(&srcdn->lock);
3854 else
3855 lov.add_rdlock(&srcdn->lock);
3856
3857 if (srcdn->get_name() < destdn->get_name())
3858 lov.add_xlock(&destdn->lock);
3859 } else {
3860 int cmp = mdr->compare_paths();
3861 bool lock_destdir_first =
3862 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3863
3864 if (lock_destdir_first) {
3865 lov.add_wrlock(&destdir->inode->filelock);
3866 lov.add_wrlock(&destdir->inode->nestlock);
3867 lov.add_xlock(&destdn->lock);
3868 }
3869
3870 if (xlock_srcdn) {
3871 mds_rank_t srcdir_auth = srcdir->authority().first;
3872 if (srcdir_auth == mds->get_nodeid()) {
3873 lov.add_wrlock(&srcdir->inode->filelock);
3874 lov.add_wrlock(&srcdir->inode->nestlock);
3875 } else {
3876 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3877 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3878 }
3879 lov.add_xlock(&srcdn->lock);
3880 } else {
3881 lov.add_rdlock(&srcdn->lock);
3882 }
3883
3884 if (!lock_destdir_first) {
3885 lov.add_wrlock(&destdir->inode->filelock);
3886 lov.add_wrlock(&destdir->inode->nestlock);
3887 lov.add_xlock(&destdn->lock);
3888 }
3889 }
3890
3891 CInode *auth_pin_freeze = nullptr;
3892 // XXX any better way to do this?
3893 if (xlock_srcdn && !srcdn->is_auth()) {
3894 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3895 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3896 }
3897 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3898 return std::make_pair(nullptr, nullptr);
3899
3900 if (srcdn->get_projected_linkage()->is_null()) {
3901 respond_to_request(mdr, -CEPHFS_ENOENT);
3902 return std::make_pair(nullptr, nullptr);
3903 }
3904
3905 if (destdn->get_projected_linkage()->is_null()) {
3906 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3907 destdn->first = std::max(destdn->first, next_snap);
3908 }
3909
3910 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3911
3912 return std::make_pair(destdn, srcdn);
3913 }
3914
3915 /**
3916 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3917 *
3918 * @param diri base inode
3919 * @param fg the exact frag we want
3920 * @param mdr request
3921 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3922 */
3923 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3924 {
3925 CDir *dir = diri->get_dirfrag(fg);
3926
3927 if (dir) {
3928 // am i auth for the dirfrag?
3929 if (!dir->is_auth()) {
3930 mds_rank_t auth = dir->authority().first;
3931 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3932 << ", fw to mds." << auth << dendl;
3933 mdcache->request_forward(mdr, auth);
3934 return nullptr;
3935 }
3936 } else {
3937 // not open and inode not mine?
3938 if (!diri->is_auth()) {
3939 mds_rank_t inauth = diri->authority().first;
3940 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3941 mdcache->request_forward(mdr, inauth);
3942 return nullptr;
3943 }
3944
3945 // not open and inode frozen?
3946 if (diri->is_frozen()) {
3947 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3948 ceph_assert(diri->get_parent_dir());
3949 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3950 return nullptr;
3951 }
3952
3953 // invent?
3954 dir = diri->get_or_open_dirfrag(mdcache, fg);
3955 }
3956
3957 return dir;
3958 }
3959
3960
3961 // ===============================================================================
3962 // STAT
3963
3964 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3965 {
3966 const cref_t<MClientRequest> &req = mdr->client_request;
3967
3968 if (req->get_filepath().depth() == 0 && is_lookup) {
3969 // refpath can't be empty for lookup but it can for
3970 // getattr (we do getattr with empty refpath for mount of '/')
3971 respond_to_request(mdr, -CEPHFS_EINVAL);
3972 return;
3973 }
3974
3975 bool want_auth = false;
3976 int mask = req->head.args.getattr.mask;
3977 if (mask & CEPH_STAT_RSTAT)
3978 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3979
3980 if (!mdr->is_batch_head() && mdr->can_batch()) {
3981 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
3982 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3983 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3984 &mdr->dn[0], &mdr->in[0]);
3985 if (r > 0)
3986 return; // delayed
3987
3988 if (r < 0) {
3989 // fall-thru. let rdlock_path_pin_ref() check again.
3990 } else if (is_lookup) {
3991 CDentry* dn = mdr->dn[0].back();
3992 mdr->pin(dn);
3993 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3994 if (em.second) {
3995 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3996 } else {
3997 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3998 em.first->second->add_request(mdr);
3999 mdr->mark_event("joining batch lookup");
4000 return;
4001 }
4002 } else {
4003 CInode *in = mdr->in[0];
4004 mdr->pin(in);
4005 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
4006 if (em.second) {
4007 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
4008 } else {
4009 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
4010 em.first->second->add_request(mdr);
4011 mdr->mark_event("joining batch getattr");
4012 return;
4013 }
4014 }
4015 }
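// Illustrative sketch (not part of Server.cc): the pattern above -- emplace a slot
// keyed by the getattr mask, become the batch head if the insert won, or join the
// existing head's batch if it lost -- shown standalone with hypothetical types.
// The real code uses emplace with std::piecewise_construct on a member map; a
// plain std::map with try_emplace expresses the same idea.
#if 0
#include <map>
#include <memory>
#include <vector>

struct Request { int id; };

struct Batch {
  Request head;
  std::vector<Request> joined;
  explicit Batch(Request r) : head(r) {}
};

std::map<int, std::unique_ptr<Batch>> batch_by_mask;

// Returns true if `req` became the head of a new batch for `mask`,
// false if it simply joined an in-flight one.
bool submit(int mask, Request req) {
  auto [it, inserted] = batch_by_mask.try_emplace(mask, nullptr);
  if (inserted) {
    it->second = std::make_unique<Batch>(req);   // first request: new batch head
    return true;
  }
  it->second->joined.push_back(req);             // later request: piggy-back
  return false;
}
#endif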
4016
4017 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
4018 if (!ref)
4019 return;
4020
4021 /*
4022 * if client currently holds the EXCL cap on a field, do not rdlock
4023 * it; client's stat() will result in valid info if _either_ EXCL
4024 * cap is held or MDS rdlocks and reads the value here.
4025 *
4026 * handling this case here is easier than weakening rdlock
4027 * semantics... that would cause problems elsewhere.
4028 */
4029 client_t client = mdr->get_client();
4030 int issued = 0;
4031 Capability *cap = ref->get_client_cap(client);
4032 if (cap && (mdr->snapid == CEPH_NOSNAP ||
4033 mdr->snapid <= cap->client_follows))
4034 issued = cap->issued();
4035
4036 // FIXME
4037 MutationImpl::LockOpVec lov;
4038 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
4039 lov.add_rdlock(&ref->linklock);
4040 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
4041 lov.add_rdlock(&ref->authlock);
4042 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
4043 lov.add_rdlock(&ref->xattrlock);
4044 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
4045 // Don't wait on unstable filelock if client is allowed to read file size.
4046 // This can reduce the response time of getattr in the case that multiple
4047 // clients do stat(2) and there are writers.
4048 // The downside of this optimization is that mds may not issue Fs caps along
4049 // with getattr reply. Client may need to send more getattr requests.
4050 if (mdr->is_rdlocked(&ref->filelock)) {
4051 lov.add_rdlock(&ref->filelock);
4052 } else if (ref->filelock.is_stable() ||
4053 ref->filelock.get_num_wrlocks() > 0 ||
4054 !ref->filelock.can_read(mdr->get_client())) {
4055       /* Since we're taking advantage of an optimization here:
4056        *
4057        * We cannot suddenly, due to a changing condition, add this filelock, as
4058        * that can cause lock-order deadlocks. In this case, the condition is a
4059        * lock-state change between request retries. If that happens, we need
4060        * to check whether we've already acquired the other locks in this vector.
4061        * If we have, then we need to drop those locks and retry.
4062        */
4063 if (mdr->is_rdlocked(&ref->linklock) ||
4064 mdr->is_rdlocked(&ref->authlock) ||
4065 mdr->is_rdlocked(&ref->xattrlock)) {
4066 /* start over */
4067 dout(20) << " dropping locks and restarting request because filelock state change" << dendl;
4068 mds->locker->drop_locks(mdr.get());
4069 mdr->drop_local_auth_pins();
4070 mds->queue_waiter(new C_MDS_RetryRequest(mdcache, mdr));
4071 return;
4072 }
4073 lov.add_rdlock(&ref->filelock);
4074 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
4075 }
4076 }
4077
4078 if (!mds->locker->acquire_locks(mdr, lov))
4079 return;
4080
4081 if (!check_access(mdr, ref, MAY_READ))
4082 return;
4083
4084 utime_t now = ceph_clock_now();
4085 mdr->set_mds_stamp(now);
4086
4087 // note which caps are requested, so we return at least a snapshot
4088 // value for them. (currently this matters for xattrs and inline data)
4089 mdr->getattr_caps = mask;
4090
4091 mds->balancer->hit_inode(ref, META_POP_IRD);
4092
4093 // reply
4094 dout(10) << "reply to stat on " << *req << dendl;
4095 mdr->tracei = ref;
4096 if (is_lookup)
4097 mdr->tracedn = mdr->dn[0].back();
4098 respond_to_request(mdr, 0);
4099 }
4100
4101 struct C_MDS_LookupIno2 : public ServerContext {
4102 MDRequestRef mdr;
4103 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
4104 void finish(int r) override {
4105 server->_lookup_ino_2(mdr, r);
4106 }
4107 };
4108
4109 /*
4110 * filepath: ino
4111 */
4112 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
4113 bool want_parent, bool want_dentry)
4114 {
4115 const cref_t<MClientRequest> &req = mdr->client_request;
4116
4117 if ((uint64_t)req->head.args.lookupino.snapid > 0)
4118 return _lookup_snap_ino(mdr);
4119
4120 inodeno_t ino = req->get_filepath().get_ino();
4121 auto _ino = ino.val;
4122
4123 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
4124 * I do not have an explanation for how that happened organically but this
4125 * check will ensure that the client can no longer do that.
4126 *
4127 * [1] https://tracker.ceph.com/issues/49922
4128 */
4129 if (MDS_IS_PRIVATE_INO(_ino)) {
4130 respond_to_request(mdr, -CEPHFS_ESTALE);
4131 return;
4132 }
4133
4134 CInode *in = mdcache->get_inode(ino);
4135 if (in && in->state_test(CInode::STATE_PURGING)) {
4136 respond_to_request(mdr, -CEPHFS_ESTALE);
4137 return;
4138 }
4139 if (!in) {
4140 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
4141 return;
4142 }
4143
4144 // check for nothing (not read or write); this still applies the
4145 // path check.
4146 if (!check_access(mdr, in, 0))
4147 return;
4148
4149 CDentry *dn = in->get_projected_parent_dn();
4150 CInode *diri = dn ? dn->get_dir()->inode : NULL;
4151
4152 MutationImpl::LockOpVec lov;
4153 if (dn && (want_parent || want_dentry)) {
4154 mdr->pin(dn);
4155 lov.add_rdlock(&dn->lock);
4156 }
4157
4158 unsigned mask = req->head.args.lookupino.mask;
4159 if (mask) {
4160 Capability *cap = in->get_client_cap(mdr->get_client());
4161 int issued = 0;
4162 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4163 issued = cap->issued();
4164 // FIXME
4165 // permission bits, ACL/security xattrs
4166 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4167 lov.add_rdlock(&in->authlock);
4168 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4169 lov.add_rdlock(&in->xattrlock);
4170
4171 mdr->getattr_caps = mask;
4172 }
4173
4174 if (!lov.empty()) {
4175 if (!mds->locker->acquire_locks(mdr, lov))
4176 return;
4177
4178 if (diri != NULL) {
4179 // need read access to directory inode
4180 if (!check_access(mdr, diri, MAY_READ))
4181 return;
4182 }
4183 }
4184
4185 if (want_parent) {
4186 if (in->is_base()) {
4187 respond_to_request(mdr, -CEPHFS_EINVAL);
4188 return;
4189 }
4190 if (!diri || diri->is_stray()) {
4191 respond_to_request(mdr, -CEPHFS_ESTALE);
4192 return;
4193 }
4194 dout(10) << "reply to lookup_parent " << *in << dendl;
4195 mdr->tracei = diri;
4196 respond_to_request(mdr, 0);
4197 } else {
4198 if (want_dentry) {
4199 inodeno_t dirino = req->get_filepath2().get_ino();
4200 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
4201 respond_to_request(mdr, -CEPHFS_ENOENT);
4202 return;
4203 }
4204 dout(10) << "reply to lookup_name " << *in << dendl;
4205 } else
4206 dout(10) << "reply to lookup_ino " << *in << dendl;
4207
4208 mdr->tracei = in;
4209 if (want_dentry)
4210 mdr->tracedn = dn;
4211 respond_to_request(mdr, 0);
4212 }
4213 }
4214
4215 void Server::_lookup_snap_ino(MDRequestRef& mdr)
4216 {
4217 const cref_t<MClientRequest> &req = mdr->client_request;
4218
4219 vinodeno_t vino;
4220 vino.ino = req->get_filepath().get_ino();
4221 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4222 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4223 __u32 hash = req->head.args.lookupino.hash;
4224
4225 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4226
4227 CInode *in = mdcache->lookup_snap_inode(vino);
4228 if (!in) {
4229 in = mdcache->get_inode(vino.ino);
4230 if (in) {
4231 if (in->state_test(CInode::STATE_PURGING) ||
4232 !in->has_snap_data(vino.snapid)) {
4233 if (in->is_dir() || !parent_ino) {
4234 respond_to_request(mdr, -CEPHFS_ESTALE);
4235 return;
4236 }
4237 in = NULL;
4238 }
4239 }
4240 }
4241
4242 if (in) {
4243 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4244 mdr->snapid = vino.snapid;
4245 mdr->tracei = in;
4246 respond_to_request(mdr, 0);
4247 return;
4248 }
4249
4250 CInode *diri = NULL;
4251 if (parent_ino) {
4252 diri = mdcache->get_inode(parent_ino);
4253 if (!diri) {
4254 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
4255 return;
4256 }
4257
4258 if (!diri->is_dir()) {
4259 respond_to_request(mdr, -CEPHFS_EINVAL);
4260 return;
4261 }
4262
4263 MutationImpl::LockOpVec lov;
4264 lov.add_rdlock(&diri->dirfragtreelock);
4265 if (!mds->locker->acquire_locks(mdr, lov))
4266 return;
4267
4268 frag_t frag = diri->dirfragtree[hash];
4269 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4270 if (!dir)
4271 return;
4272
4273 if (!dir->is_complete()) {
4274 if (dir->is_frozen()) {
4275 mds->locker->drop_locks(mdr.get());
4276 mdr->drop_local_auth_pins();
4277 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4278 return;
4279 }
4280 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4281 return;
4282 }
4283
4284 respond_to_request(mdr, -CEPHFS_ESTALE);
4285 } else {
4286 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4287 }
4288 }
4289
4290 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4291 {
4292 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4293 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4294
4295 // `r` is a rank if >=0, else an error code
4296 if (r >= 0) {
4297 mds_rank_t dest_rank(r);
4298 if (dest_rank == mds->get_nodeid())
4299 dispatch_client_request(mdr);
4300 else
4301 mdcache->request_forward(mdr, dest_rank);
4302 return;
4303 }
4304
4305 // give up
4306 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4307 r = -CEPHFS_ESTALE;
4308 respond_to_request(mdr, r);
4309 }
4310
4311
4312 /* This function takes responsibility for the passed mdr */
4313 void Server::handle_client_open(MDRequestRef& mdr)
4314 {
4315 const cref_t<MClientRequest> &req = mdr->client_request;
4316 dout(7) << "open on " << req->get_filepath() << dendl;
4317
4318 int flags = req->head.args.open.flags;
4319 int cmode = ceph_flags_to_mode(flags);
4320 if (cmode < 0) {
4321 respond_to_request(mdr, -CEPHFS_EINVAL);
4322 return;
4323 }
4324
4325 bool need_auth = !file_mode_is_readonly(cmode) ||
4326 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4327
4328 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4329 dout(7) << "read-only FS" << dendl;
4330 respond_to_request(mdr, -CEPHFS_EROFS);
4331 return;
4332 }
4333
4334 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4335 if (!cur)
4336 return;
4337
4338 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4339 ceph_assert(!need_auth);
4340 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4341 CInode *cur = rdlock_path_pin_ref(mdr, true);
4342 if (!cur)
4343 return;
4344 }
4345
4346 if (!cur->is_file()) {
4347 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4348 cmode = CEPH_FILE_MODE_PIN;
4349     // the inode is a symlink and the client wants to follow it, so ignore the O_TRUNC flag.
4350 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4351 flags &= ~CEPH_O_TRUNC;
4352 }
4353
4354 dout(10) << "open flags = " << flags
4355 << ", filemode = " << cmode
4356 << ", need_auth = " << need_auth
4357 << dendl;
4358
4359 // regular file?
4360 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4361 dout(7) << "not a file or dir " << *cur << dendl;
4362 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4363 return;
4364 }*/
4365 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
4366 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4367 respond_to_request(mdr, -CEPHFS_EINVAL);
4368 return;
4369 }
4370
4371 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
4372 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4373 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4374 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
4375 return;
4376 }
4377
4378 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
4379 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4380 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4381 respond_to_request(mdr, -CEPHFS_EPERM);
4382 return;
4383 }
4384
4385 // snapped data is read only
4386 if (mdr->snapid != CEPH_NOSNAP &&
4387 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4388 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4389 respond_to_request(mdr, -CEPHFS_EROFS);
4390 return;
4391 }
4392
4393 MutationImpl::LockOpVec lov;
4394 lov.add_rdlock(&cur->snaplock);
4395
4396 unsigned mask = req->head.args.open.mask;
4397 if (mask) {
4398 Capability *cap = cur->get_client_cap(mdr->get_client());
4399 int issued = 0;
4400 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4401 issued = cap->issued();
4402 // permission bits, ACL/security xattrs
4403 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4404 lov.add_rdlock(&cur->authlock);
4405 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4406 lov.add_rdlock(&cur->xattrlock);
4407
4408 mdr->getattr_caps = mask;
4409 }
4410
4411 // O_TRUNC
4412 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4413 ceph_assert(cur->is_auth());
4414
4415 lov.add_xlock(&cur->filelock);
4416 if (!mds->locker->acquire_locks(mdr, lov))
4417 return;
4418
4419 if (!check_access(mdr, cur, MAY_WRITE))
4420 return;
4421
4422 // wait for pending truncate?
4423 const auto& pi = cur->get_projected_inode();
4424 if (pi->is_truncating()) {
4425 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4426 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4427 mds->locker->drop_locks(mdr.get());
4428 mdr->drop_local_auth_pins();
4429 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4430 return;
4431 }
4432
4433 do_open_truncate(mdr, cmode);
4434 return;
4435 }
4436
4437 // sync filelock if snapped.
4438 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4439 // and that data itself is flushed so that we can read the snapped data off disk.
4440 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4441 lov.add_rdlock(&cur->filelock);
4442 }
4443
4444 if (!mds->locker->acquire_locks(mdr, lov))
4445 return;
4446
4447 mask = MAY_READ;
4448 if (cmode & CEPH_FILE_MODE_WR)
4449 mask |= MAY_WRITE;
4450 if (!check_access(mdr, cur, mask))
4451 return;
4452
4453 utime_t now = ceph_clock_now();
4454 mdr->set_mds_stamp(now);
4455
4456 if (cur->is_file() || cur->is_dir()) {
4457 if (mdr->snapid == CEPH_NOSNAP) {
4458 // register new cap
4459 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4460 if (cap)
4461 dout(12) << "open issued caps " << ccap_string(cap->pending())
4462 << " for " << req->get_source()
4463 << " on " << *cur << dendl;
4464 } else {
4465 int caps = ceph_caps_for_mode(cmode);
4466 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4467 << " for " << req->get_source()
4468 << " snapid " << mdr->snapid
4469 << " on " << *cur << dendl;
4470 mdr->snap_caps = caps;
4471 }
4472 }
4473
4474 // increase max_size?
4475 if (cmode & CEPH_FILE_MODE_WR)
4476 mds->locker->check_inode_max_size(cur);
4477
4478 // make sure this inode gets into the journal
4479 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4480 mdcache->open_file_table.should_log_open(cur)) {
4481 EOpen *le = new EOpen(mds->mdlog);
4482 mdlog->start_entry(le);
4483 le->add_clean_inode(cur);
4484 mdlog->submit_entry(le);
4485 }
4486
4487 // hit pop
4488 if (cmode & CEPH_FILE_MODE_WR)
4489 mds->balancer->hit_inode(cur, META_POP_IWR);
4490 else
4491 mds->balancer->hit_inode(cur, META_POP_IRD);
4492
4493 CDentry *dn = 0;
4494 if (req->get_dentry_wanted()) {
4495 ceph_assert(mdr->dn[0].size());
4496 dn = mdr->dn[0].back();
4497 }
4498
4499 mdr->tracei = cur;
4500 mdr->tracedn = dn;
4501 respond_to_request(mdr, 0);
4502 }
4503
4504 class C_MDS_openc_finish : public ServerLogContext {
4505 CDentry *dn;
4506 CInode *newi;
4507 public:
4508 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4509 ServerLogContext(s, r), dn(d), newi(ni) {}
4510 void finish(int r) override {
4511 ceph_assert(r == 0);
4512
4513 // crash current MDS and the replacing MDS will test the journal
4514 ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
4515
4516 dn->pop_projected_linkage();
4517
4518 // dirty inode, dn, dir
4519 newi->mark_dirty(mdr->ls);
4520 newi->mark_dirty_parent(mdr->ls, true);
4521
4522 mdr->apply();
4523
4524 get_mds()->locker->share_inode_max_size(newi);
4525
4526 MDRequestRef null_ref;
4527 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4528
4529 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4530
4531 server->respond_to_request(mdr, 0);
4532
4533 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4534 }
4535 };
4536
4537 /* This function takes responsibility for the passed mdr */
4538 void Server::handle_client_openc(MDRequestRef& mdr)
4539 {
4540 const cref_t<MClientRequest> &req = mdr->client_request;
4541 client_t client = mdr->get_client();
4542
4543 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4544
4545 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4546 if (cmode < 0) {
4547 respond_to_request(mdr, -CEPHFS_EINVAL);
4548 return;
4549 }
4550
4551 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4552 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true, true);
4553 if (!dn)
4554 return;
4555
4556 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4557 if (!excl && !dnl->is_null()) {
4558 // it existed.
4559 ceph_assert(mdr.get()->is_rdlocked(&dn->lock));
4560
4561 handle_client_open(mdr);
4562 return;
4563 }
4564
4565 ceph_assert(dnl->is_null());
4566
4567 if (req->get_alternate_name().size() > alternate_name_max) {
4568 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4569 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4570 return;
4571 }
4572 dn->set_alternate_name(req->get_alternate_name());
4573
4574 // set layout
4575 file_layout_t layout;
4576 if (mdr->dir_layout != file_layout_t())
4577 layout = mdr->dir_layout;
4578 else
4579 layout = mdcache->default_file_layout;
4580
4581 // What kind of client caps are required to complete this operation
4582 uint64_t access = MAY_WRITE;
4583
4584 const auto default_layout = layout;
4585
4586 // fill in any special params from client
4587 if (req->head.args.open.stripe_unit)
4588 layout.stripe_unit = req->head.args.open.stripe_unit;
4589 if (req->head.args.open.stripe_count)
4590 layout.stripe_count = req->head.args.open.stripe_count;
4591 if (req->head.args.open.object_size)
4592 layout.object_size = req->head.args.open.object_size;
4593 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4594 (__s32)req->head.args.open.pool >= 0) {
4595 layout.pool_id = req->head.args.open.pool;
4596
4597 // make sure we have as new a map as the client
4598 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4599 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4600 return;
4601 }
4602 }
4603
4604 // If client doesn't have capability to modify layout pools, then
4605 // only permit this request if the requested pool matches what the
4606 // file would have inherited anyway from its parent.
4607 if (default_layout != layout) {
4608 access |= MAY_SET_VXATTR;
4609 }
4610
4611 if (!layout.is_valid()) {
4612 dout(10) << " invalid initial file layout" << dendl;
4613 respond_to_request(mdr, -CEPHFS_EINVAL);
4614 return;
4615 }
4616 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4617 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4618 respond_to_request(mdr, -CEPHFS_EINVAL);
4619 return;
4620 }
4621
4622 // created null dn.
4623 CDir *dir = dn->get_dir();
4624 CInode *diri = dir->get_inode();
4625 if (!check_access(mdr, diri, access))
4626 return;
4627 if (!check_fragment_space(mdr, dir))
4628 return;
4629 if (!check_dir_max_entries(mdr, dir))
4630 return;
4631
4632 if (mdr->dn[0].size() == 1)
4633 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4634
4635 // create inode.
4636 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4637 req->head.args.open.mode | S_IFREG, &layout);
4638 ceph_assert(newi);
4639
4640 // it's a file.
4641 dn->push_projected_linkage(newi);
4642
4643 auto _inode = newi->_get_inode();
4644 _inode->version = dn->pre_dirty();
4645 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4646 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4647 _inode->update_backtrace();
4648 _inode->rstat.rfiles = 1;
4649 _inode->accounted_rstat = _inode->rstat;
4650
4651 SnapRealm *realm = diri->find_snaprealm();
4652 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4653 ceph_assert(follows >= realm->get_newest_seq());
4654
4655 ceph_assert(dn->first == follows+1);
4656 newi->first = dn->first;
4657
4658 // do the open
4659 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4660 newi->authlock.set_state(LOCK_EXCL);
4661 newi->xattrlock.set_state(LOCK_EXCL);
4662
4663 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4664 _inode->client_ranges[client].range.first = 0;
4665 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4666 _inode->client_ranges[client].follows = follows;
4667 newi->mark_clientwriteable();
4668 cap->mark_clientwriteable();
4669 }
4670
4671 // prepare finisher
4672 mdr->ls = mdlog->get_current_segment();
4673 EUpdate *le = new EUpdate(mdlog, "openc");
4674 mdlog->start_entry(le);
4675 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4676 journal_allocated_inos(mdr, &le->metablob);
4677 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4678 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4679
4680 // make sure this inode gets into the journal
4681 le->metablob.add_opened_ino(newi->ino());
4682
4683 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4684
4685 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4686 openc_response_t ocresp;
4687
4688 dout(10) << "adding created_ino and delegated_inos" << dendl;
4689 ocresp.created_ino = _inode->ino;
4690
4691 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4692 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4693 unsigned frac = 100 / delegate_inos_pct;
4694 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4695 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4696 }
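// Worked example for the delegation math above (values hypothetical): with
// delegate_inos_pct = 20 and mds_client_prealloc_inos = 1000, frac = 100/20 = 5,
// so the delegation quota is 1000/5 = 200 inos, and a fresh batch of up to 200 is
// handed out only once the client's remaining delegated set drops below 200/2 = 100.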
4697
4698 encode(ocresp, mdr->reply_extra_bl);
4699 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4700 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4701 // add the file created flag onto the reply if create_flags features is supported
4702 encode(newi->ino(), mdr->reply_extra_bl);
4703 }
4704
4705 journal_and_reply(mdr, newi, dn, le, fin);
4706
4707 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4708 // have overshot the split size (multiple opencs in flight), so here is
4709 // an early chance to split the dir if this openc makes it oversized.
4710 mds->balancer->maybe_fragment(dir, false);
4711 }
4712
4713
4714 void Server::_finalize_readdir(MDRequestRef& mdr,
4715 CInode *diri,
4716 CDir* dir,
4717 bool start,
4718 bool end,
4719 __u16 flags,
4720 __u32 numfiles,
4721 bufferlist& dirbl,
4722 bufferlist& dnbl)
4723 {
4724 const cref_t<MClientRequest> &req = mdr->client_request;
4725 Session *session = mds->get_session(req);
4726
4727 session->touch_readdir_cap(numfiles);
4728
4729 if (end) {
4730 flags |= CEPH_READDIR_FRAG_END;
4731 if (start)
4732 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4733 }
4734
4735 // finish final blob
4736 encode(numfiles, dirbl);
4737 encode(flags, dirbl);
4738 dirbl.claim_append(dnbl);
4739
4740 // yay, reply
4741 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4742 << " bytes=" << dirbl.length()
4743 << " start=" << (int)start
4744 << " end=" << (int)end
4745 << dendl;
4746 mdr->reply_extra_bl = dirbl;
4747
4748 // bump popularity. NOTE: this doesn't quite capture it.
4749 mds->balancer->hit_dir(dir, META_POP_READDIR, numfiles);
4750
4751 // reply
4752 mdr->tracei = diri;
4753 respond_to_request(mdr, 0);
4754 }
4755
4756 void Server::handle_client_readdir(MDRequestRef& mdr)
4757 {
4758 const cref_t<MClientRequest> &req = mdr->client_request;
4759 Session *session = mds->get_session(req);
4760 client_t client = req->get_source().num();
4761 MutationImpl::LockOpVec lov;
4762 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4763 if (!diri) return;
4764
4765 // it's a directory, right?
4766 if (!diri->is_dir()) {
4767 // not a dir
4768 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4769 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4770 return;
4771 }
4772
4773 auto num_caps = session->get_num_caps();
4774 auto session_cap_acquisition = session->get_cap_acquisition();
4775
4776 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4777 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4778 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4779 if (logger)
4780 logger->inc(l_mdss_cap_acquisition_throttle);
4781
4782 mdr->mark_event("cap_acquisition_throttle");
4783 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4784 return;
4785 }
4786
4787 lov.add_rdlock(&diri->filelock);
4788 lov.add_rdlock(&diri->dirfragtreelock);
4789
4790 if (!mds->locker->acquire_locks(mdr, lov))
4791 return;
4792
4793 if (!check_access(mdr, diri, MAY_READ))
4794 return;
4795
4796 // which frag?
4797 frag_t fg = (__u32)req->head.args.readdir.frag;
4798 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4799 string offset_str = req->get_path2();
4800
4801 __u32 offset_hash = 0;
4802 if (!offset_str.empty())
4803 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4804 else
4805 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4806
4807 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4808 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4809
4810 // does the frag exist?
4811 if (diri->dirfragtree[fg.value()] != fg) {
4812 frag_t newfg;
4813 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4814 if (fg.contains((unsigned)offset_hash)) {
4815 newfg = diri->dirfragtree[offset_hash];
4816 } else {
4817 // client actually wants next frag
4818 newfg = diri->dirfragtree[fg.value()];
4819 }
4820 } else {
4821 offset_str.clear();
4822 newfg = diri->dirfragtree[fg.value()];
4823 }
4824 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4825 fg = newfg;
4826 }
4827
4828 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4829 if (!dir) return;
4830
4831 // ok!
4832 dout(10) << "handle_client_readdir on " << *dir << dendl;
4833 ceph_assert(dir->is_auth());
4834
4835 if (!dir->is_complete()) {
4836 if (dir->is_frozen()) {
4837 dout(7) << "dir is frozen " << *dir << dendl;
4838 mds->locker->drop_locks(mdr.get());
4839 mdr->drop_local_auth_pins();
4840 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4841 return;
4842 }
4843 // fetch
4844 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4845 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4846 return;
4847 }
4848
4849 #ifdef MDS_VERIFY_FRAGSTAT
4850 dir->verify_fragstat();
4851 #endif
4852
4853 utime_t now = ceph_clock_now();
4854 mdr->set_mds_stamp(now);
4855
4856 snapid_t snapid = mdr->snapid;
4857 dout(10) << "snapid " << snapid << dendl;
4858
4859 SnapRealm *realm = diri->find_snaprealm();
4860
4861 unsigned max = req->head.args.readdir.max_entries;
4862 if (!max)
4863 max = dir->get_num_any(); // whatever, something big.
4864 unsigned max_bytes = req->head.args.readdir.max_bytes;
4865 if (!max_bytes)
4866 // make sure at least one item can be encoded
4867 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4868
4869 // start final blob
4870 bufferlist dirbl;
4871 DirStat ds;
4872 ds.frag = dir->get_frag();
4873 ds.auth = dir->get_dir_auth().first;
4874 if (dir->is_auth() && !forward_all_requests_to_auth)
4875 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4876
4877 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4878
4879 // count bytes available.
4880 // this isn't perfect, but we should capture the main variable/unbounded size items!
4881 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4882 int bytes_left = max_bytes - front_bytes;
4883 bytes_left -= get_snap_trace(session, realm).length();
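// front_bytes reserves room for the trailing entry count (u32) and flags
// (u16, counted as two u8) that _finalize_readdir appends; the snap trace
// sent with the reply is subtracted from the budget as well.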
4884
4885 // build dir contents
4886 bufferlist dnbl;
4887 __u32 numfiles = 0;
4888 bool start = !offset_hash && offset_str.empty();
4889 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4890 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4891 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4892 bool end = (it == dir->end());
4893 for (; !end && numfiles < max; end = (it == dir->end())) {
4894 CDentry *dn = it->second;
4895 ++it;
4896
4897 if (dn->state_test(CDentry::STATE_PURGING))
4898 continue;
4899
4900 bool dnp = dn->use_projected(client, mdr);
4901 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4902
4903 if (dnl->is_null()) {
4904 if (dn->get_num_ref() == 0 && !dn->is_projected())
4905 dir->remove_dentry(dn);
4906 continue;
4907 }
4908
4909 if (dn->last < snapid || dn->first > snapid) {
4910 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4911 continue;
4912 }
4913
4914 if (!start) {
4915 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4916 if (!(offset_key < dn->key()))
4917 continue;
4918 }
4919
4920 CInode *in = dnl->get_inode();
4921
4922 if (in && in->ino() == CEPH_INO_CEPH)
4923 continue;
4924
4925 // remote link?
4926 // better for the MDS to do the work, if we think the client will stat any of these files.
4927 if (dnl->is_remote() && !in) {
4928 in = mdcache->get_inode(dnl->get_remote_ino());
4929 if (in) {
4930 dn->link_remote(dnl, in);
4931 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4932 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4933 continue;
4934 } else {
4935 // touch everything I _do_ have
4936 for (auto &p : *dir) {
4937 if (!p.second->get_linkage()->is_null())
4938 mdcache->lru.lru_touch(p.second);
4939 }
4940
4941 // already issued caps and leases, reply immediately.
4942 if (dnbl.length() > 0) {
4943 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4944 dout(10) << " open remote dentry after caps were issued, stopping at "
4945 << dnbl.length() << " < " << bytes_left << dendl;
4946 break;
4947 }
4948
4949 mds->locker->drop_locks(mdr.get());
4950 mdr->drop_local_auth_pins();
4951 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4952 return;
4953 }
4954 }
4955 ceph_assert(in);
4956
4957 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4958 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4959 break;
4960 }
4961
4962 unsigned start_len = dnbl.length();
4963
4964 // dentry
4965 dout(12) << "including dn " << *dn << dendl;
4966 encode(dn->get_name(), dnbl);
4967 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
4968
4969 // inode
4970 dout(12) << "including inode in " << *in << " snap " << snapid << dendl;
4971 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4972 if (r < 0) {
4973 // chop off dn->name, lease
4974 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4975 bufferlist keep;
4976 keep.substr_of(dnbl, 0, start_len);
4977 dnbl.swap(keep);
4978 break;
4979 }
4980 ceph_assert(r >= 0);
4981 numfiles++;
4982
4983 // touch dn
4984 mdcache->lru.lru_touch(dn);
4985 }
4986 __u16 flags = 0;
4987 // clients only understand the END and COMPLETE flags?
4988 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4989 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4990 }
4991 _finalize_readdir(mdr, diri, dir, start, end, flags, numfiles, dirbl, dnbl);
4992 }
4993
4994
4995
4996 // ===============================================================================
4997 // INODE UPDATES
4998
4999
5000 /*
5001 * finisher for basic inode updates
5002 */
5003 class C_MDS_inode_update_finish : public ServerLogContext {
5004 CInode *in;
5005 bool truncating_smaller, changed_ranges, adjust_realm;
5006 public:
5007 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
5008 bool sm=false, bool cr=false, bool ar=false) :
5009 ServerLogContext(s, r), in(i),
5010 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
5011 void finish(int r) override {
5012 ceph_assert(r == 0);
5013
5014 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
5015
5016 // apply
5017 mdr->apply();
5018
5019 MDSRank *mds = get_mds();
5020
5021 // notify any clients
5022 if (truncating_smaller && in->get_inode()->is_truncating()) {
5023 mds->locker->issue_truncate(in);
5024 mds->mdcache->truncate_inode(in, mdr->ls);
5025 }
5026
5027 if (adjust_realm) {
5028 mds->mdcache->send_snap_update(in, 0, snap_op);
5029 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
5030 }
5031
5032 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5033
5034 server->respond_to_request(mdr, 0);
5035
5036 if (changed_ranges)
5037 get_mds()->locker->share_inode_max_size(in);
5038 }
5039 };
5040
5041 void Server::handle_client_file_setlock(MDRequestRef& mdr)
5042 {
5043 const cref_t<MClientRequest> &req = mdr->client_request;
5044 MutationImpl::LockOpVec lov;
5045
5046 // get the inode to operate on, and set up any locks needed for that
5047 CInode *cur = rdlock_path_pin_ref(mdr, true);
5048 if (!cur)
5049 return;
5050
5051 lov.add_xlock(&cur->flocklock);
5052 /* acquire_locks will return true if it gets the locks. If it fails,
5053 it will redeliver this request at a later date, so drop the request.
5054 */
5055 if (!mds->locker->acquire_locks(mdr, lov)) {
5056 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
5057 return;
5058 }
5059
5060 // copy the lock change into a ceph_filelock so we can store/apply it
5061 ceph_filelock set_lock;
5062 set_lock.start = req->head.args.filelock_change.start;
5063 set_lock.length = req->head.args.filelock_change.length;
5064 set_lock.client = req->get_orig_source().num();
5065 set_lock.owner = req->head.args.filelock_change.owner;
5066 set_lock.pid = req->head.args.filelock_change.pid;
5067 set_lock.type = req->head.args.filelock_change.type;
5068 bool will_wait = req->head.args.filelock_change.wait;
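// will_wait: if the lock cannot be granted now, park the request on the
// inode's WAIT_FLOCK list instead of returning EWOULDBLOCK.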
5069
5070 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
5071
5072 ceph_lock_state_t *lock_state = NULL;
5073 bool interrupt = false;
5074
5075 // get the appropriate lock state
5076 switch (req->head.args.filelock_change.rule) {
5077 case CEPH_LOCK_FLOCK_INTR:
5078 interrupt = true;
5079 // fall-thru
5080 case CEPH_LOCK_FLOCK:
5081 lock_state = cur->get_flock_lock_state();
5082 break;
5083
5084 case CEPH_LOCK_FCNTL_INTR:
5085 interrupt = true;
5086 // fall-thru
5087 case CEPH_LOCK_FCNTL:
5088 lock_state = cur->get_fcntl_lock_state();
5089 break;
5090
5091 default:
5092 dout(10) << "got unknown lock type " << set_lock.type
5093 << ", dropping request!" << dendl;
5094 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
5095 return;
5096 }
5097
5098 dout(10) << " state prior to lock change: " << *lock_state << dendl;
5099 if (CEPH_LOCK_UNLOCK == set_lock.type) {
5100 list<ceph_filelock> activated_locks;
5101 MDSContext::vec waiters;
5102 if (lock_state->is_waiting(set_lock)) {
5103 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
5104 lock_state->remove_waiting(set_lock);
5105 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
5106 } else if (!interrupt) {
5107 dout(10) << " unlock attempt on " << set_lock << dendl;
5108 lock_state->remove_lock(set_lock, activated_locks);
5109 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
5110 }
5111 mds->queue_waiters(waiters);
5112
5113 respond_to_request(mdr, 0);
5114 } else {
5115 dout(10) << " lock attempt on " << set_lock << dendl;
5116 bool deadlock = false;
5117 if (mdr->more()->flock_was_waiting &&
5118 !lock_state->is_waiting(set_lock)) {
5119 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
5120 respond_to_request(mdr, -CEPHFS_EINTR);
5121 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
5122 dout(10) << " it failed on this attempt" << dendl;
5123 // couldn't set lock right now
5124 if (deadlock) {
5125 respond_to_request(mdr, -CEPHFS_EDEADLK);
5126 } else if (!will_wait) {
5127 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
5128 } else {
5129 dout(10) << " added to waiting list" << dendl;
5130 ceph_assert(lock_state->is_waiting(set_lock));
5131 mdr->more()->flock_was_waiting = true;
5132 mds->locker->drop_locks(mdr.get());
5133 mdr->drop_local_auth_pins();
5134 mdr->mark_event("failed to add lock, waiting");
5135 mdr->mark_nowarn();
5136 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
5137 }
5138 } else
5139 respond_to_request(mdr, 0);
5140 }
5141 dout(10) << " state after lock change: " << *lock_state << dendl;
5142 }
5143
5144 void Server::handle_client_file_readlock(MDRequestRef& mdr)
5145 {
5146 const cref_t<MClientRequest> &req = mdr->client_request;
5147 MutationImpl::LockOpVec lov;
5148
5149 // get the inode to operate on, and set up any locks needed for that
5150 CInode *cur = rdlock_path_pin_ref(mdr, true);
5151 if (!cur)
5152 return;
5153
5154 /* acquire_locks will return true if it gets the locks. If it fails,
5155 it will redeliver this request at a later date, so drop the request.
5156 */
5157 lov.add_rdlock(&cur->flocklock);
5158 if (!mds->locker->acquire_locks(mdr, lov)) {
5159 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
5160 return;
5161 }
5162
5163 // copy the lock change into a ceph_filelock so we can store/apply it
5164 ceph_filelock checking_lock;
5165 checking_lock.start = req->head.args.filelock_change.start;
5166 checking_lock.length = req->head.args.filelock_change.length;
5167 checking_lock.client = req->get_orig_source().num();
5168 checking_lock.owner = req->head.args.filelock_change.owner;
5169 checking_lock.pid = req->head.args.filelock_change.pid;
5170 checking_lock.type = req->head.args.filelock_change.type;
5171
5172 // get the appropriate lock state
5173 ceph_lock_state_t *lock_state = NULL;
5174 switch (req->head.args.filelock_change.rule) {
5175 case CEPH_LOCK_FLOCK:
5176 lock_state = cur->get_flock_lock_state();
5177 break;
5178
5179 case CEPH_LOCK_FCNTL:
5180 lock_state = cur->get_fcntl_lock_state();
5181 break;
5182
5183 default:
5184 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
5185 respond_to_request(mdr, -CEPHFS_EINVAL);
5186 return;
5187 }
5188 lock_state->look_for_lock(checking_lock);
5189
5190 bufferlist lock_bl;
5191 encode(checking_lock, lock_bl);
5192
5193 mdr->reply_extra_bl = lock_bl;
5194 respond_to_request(mdr, 0);
5195 }
5196
5197 void Server::handle_client_setattr(MDRequestRef& mdr)
5198 {
5199 const cref_t<MClientRequest> &req = mdr->client_request;
5200 MutationImpl::LockOpVec lov;
5201 CInode *cur = rdlock_path_pin_ref(mdr, true);
5202 if (!cur) return;
5203
5204 if (mdr->snapid != CEPH_NOSNAP) {
5205 respond_to_request(mdr, -CEPHFS_EROFS);
5206 return;
5207 }
5208 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
5209 respond_to_request(mdr, -CEPHFS_EPERM);
5210 return;
5211 }
5212
5213 __u32 mask = req->head.args.setattr.mask;
5214 __u32 access_mask = MAY_WRITE;
5215
5216 if (req->get_header().version < 6) {
5217 // No changes to fscrypted inodes by downrevved clients
5218 if (!cur->get_inode()->fscrypt_auth.empty()) {
5219 respond_to_request(mdr, -CEPHFS_EPERM);
5220 return;
5221 }
5222
5223 // Only allow fscrypt field changes by capable clients
5224 if (mask & (CEPH_SETATTR_FSCRYPT_FILE|CEPH_SETATTR_FSCRYPT_AUTH)) {
5225 respond_to_request(mdr, -CEPHFS_EINVAL);
5226 return;
5227 }
5228 }
5229
5230 // xlock inode
5231 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID))
5232 lov.add_xlock(&cur->authlock);
5233 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE|CEPH_SETATTR_FSCRYPT_FILE))
5234 lov.add_xlock(&cur->filelock);
5235 if (mask & CEPH_SETATTR_CTIME)
5236 lov.add_wrlock(&cur->versionlock);
5237
5238 if (!mds->locker->acquire_locks(mdr, lov))
5239 return;
5240
5241 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
5242 access_mask |= MAY_CHOWN;
5243
5244 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
5245 access_mask |= MAY_CHGRP;
5246
5247 if (!check_access(mdr, cur, access_mask))
5248 return;
5249
5250 // trunc from bigger -> smaller?
5251 const auto& pip = cur->get_projected_inode();
5252
5253 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
5254
5255 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5256 if (is_full && req->head.args.setattr.size > old_size) {
5257 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5258 respond_to_request(mdr, -CEPHFS_ENOSPC);
5259 return;
5260 }
5261
5262 bool truncating_smaller = false;
5263 if (mask & CEPH_SETATTR_SIZE) {
5264 if (req->get_data().length() >
5265 sizeof(struct ceph_fscrypt_last_block_header) + fscrypt_last_block_max_size) {
5266 dout(10) << __func__ << ": the last block size is too large" << dendl;
5267 respond_to_request(mdr, -CEPHFS_EINVAL);
5268 return;
5269 }
5270
5271 truncating_smaller = req->head.args.setattr.size < old_size ||
5272 (req->head.args.setattr.size == old_size && req->get_data().length());
5273 if (truncating_smaller && pip->is_truncating()) {
5274 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5275 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5276 mds->locker->drop_locks(mdr.get());
5277 mdr->drop_local_auth_pins();
5278 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5279 return;
5280 }
5281
5282 if (truncating_smaller && req->get_data().length()) {
5283 struct ceph_fscrypt_last_block_header header;
5284 memset(&header, 0, sizeof(header));
5285 auto bl = req->get_data().cbegin();
5286 DECODE_START(1, bl);
5287 decode(header.change_attr, bl);
5288 DECODE_FINISH(bl);
5289
5290 dout(20) << __func__ << " mdr->retry:" << mdr->retry
5291 << " header.change_attr: " << header.change_attr
5292 << " header.file_offset: " << header.file_offset
5293 << " header.block_size: " << header.block_size
5294 << dendl;
5295
5296 if (header.change_attr != pip->change_attr) {
5297 dout(5) << __func__ << ": header.change_attr:" << header.change_attr
5298 << " != current change_attr:" << pip->change_attr
5299 << ", let client retry it!" << dendl;
5300 // flush the journal to make sure the client will get the latest
5301 // change_attr for the next retry
5302 mds->mdlog->flush();
5303 respond_to_request(mdr, -CEPHFS_EAGAIN);
5304 return;
5305 }
5306 }
5307 }
5308
5309 bool changed_ranges = false;
5310
5311 // project update
5312 mdr->ls = mdlog->get_current_segment();
5313 EUpdate *le = new EUpdate(mdlog, "setattr");
5314 mdlog->start_entry(le);
5315
5316 auto pi = cur->project_inode(mdr);
5317
5318 if (mask & CEPH_SETATTR_UID)
5319 pi.inode->uid = req->head.args.setattr.uid;
5320 if (mask & CEPH_SETATTR_GID)
5321 pi.inode->gid = req->head.args.setattr.gid;
5322
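// When the mode is not being set explicitly, an ownership change (or an
// explicit KILL_SGUID) on an executable regular file clears both setuid and
// setgid, matching chown semantics; otherwise CEPH_SETATTR_KILL_SUID /
// KILL_SGID clear the individual bits.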
5323 if (mask & CEPH_SETATTR_MODE)
5324 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5325 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID|
5326 CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID)) &&
5327 S_ISREG(pi.inode->mode)) {
5328 if (mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID) &&
5329 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5330 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5331 } else {
5332 if (mask & CEPH_SETATTR_KILL_SUID) {
5333 pi.inode->mode &= ~S_ISUID;
5334 }
5335 if (mask & CEPH_SETATTR_KILL_SGID) {
5336 pi.inode->mode &= ~S_ISGID;
5337 }
5338 }
5339 }
5340
5341 if (mask & CEPH_SETATTR_MTIME)
5342 pi.inode->mtime = req->head.args.setattr.mtime;
5343 if (mask & CEPH_SETATTR_ATIME)
5344 pi.inode->atime = req->head.args.setattr.atime;
5345 if (mask & CEPH_SETATTR_BTIME)
5346 pi.inode->btime = req->head.args.setattr.btime;
5347 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5348 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5349 if (mask & CEPH_SETATTR_SIZE) {
5350 if (truncating_smaller) {
5351 pi.inode->truncate(old_size, req->head.args.setattr.size, req->get_data());
5352 le->metablob.add_truncate_start(cur->ino());
5353 } else {
5354 pi.inode->size = req->head.args.setattr.size;
5355 pi.inode->rstat.rbytes = pi.inode->size;
5356 }
5357 pi.inode->mtime = mdr->get_op_stamp();
5358
5359 // adjust client's max_size?
5360 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5361 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5362 << " -> " << pi.inode->client_ranges << dendl;
5363 changed_ranges = true;
5364 }
5365 }
5366
5367 if (mask & CEPH_SETATTR_FSCRYPT_AUTH)
5368 pi.inode->fscrypt_auth = req->fscrypt_auth;
5369 if (mask & CEPH_SETATTR_FSCRYPT_FILE)
5370 pi.inode->fscrypt_file = req->fscrypt_file;
5371
5372 pi.inode->version = cur->pre_dirty();
5373 pi.inode->ctime = mdr->get_op_stamp();
5374 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5375 pi.inode->rstat.rctime = mdr->get_op_stamp();
5376 pi.inode->change_attr++;
5377
5378 // log + wait
5379 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5380 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5381 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5382
5383 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5384 truncating_smaller, changed_ranges));
5385
5386 // flush immediately if there are readers/writers waiting
5387 if (mdr->is_xlocked(&cur->filelock) &&
5388 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5389 mds->mdlog->flush();
5390 }
5391
5392 /* Takes responsibility for mdr */
5393 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5394 {
5395 CInode *in = mdr->in[0];
5396 client_t client = mdr->get_client();
5397 ceph_assert(in);
5398
5399 dout(10) << "do_open_truncate " << *in << dendl;
5400
5401 SnapRealm *realm = in->find_snaprealm();
5402 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5403
5404 mdr->ls = mdlog->get_current_segment();
5405 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5406 mdlog->start_entry(le);
5407
5408 // prepare
5409 auto pi = in->project_inode(mdr);
5410 pi.inode->version = in->pre_dirty();
5411 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5412 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5413 pi.inode->rstat.rctime = mdr->get_op_stamp();
5414 pi.inode->change_attr++;
5415
5416 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5417 if (old_size > 0) {
5418 pi.inode->truncate(old_size, 0);
5419 le->metablob.add_truncate_start(in->ino());
5420 }
5421
5422 bool changed_ranges = false;
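// For a writable open, seed the opener's client_range with one layout size
// increment (its initial max_size); 'follows' records the realm's newest snap seq.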
5423 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5424 pi.inode->client_ranges[client].range.first = 0;
5425 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5426 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5427 changed_ranges = true;
5428 in->mark_clientwriteable();
5429 cap->mark_clientwriteable();
5430 }
5431
5432 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5433
5434 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5435 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5436
5437 // make sure ino gets into the journal
5438 le->metablob.add_opened_ino(in->ino());
5439
5440 mdr->o_trunc = true;
5441
5442 CDentry *dn = 0;
5443 if (mdr->client_request->get_dentry_wanted()) {
5444 ceph_assert(mdr->dn[0].size());
5445 dn = mdr->dn[0].back();
5446 }
5447
5448 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5449 changed_ranges));
5450 // Although the `open` part can give an early reply, the truncation won't
5451 // happen until our EUpdate is persistent; to give the client a prompt
5452 // response we must also flush that event.
5453 mdlog->flush();
5454 }
5455
5456
5457 /* This function cleans up the passed mdr */
5458 void Server::handle_client_setlayout(MDRequestRef& mdr)
5459 {
5460 const cref_t<MClientRequest> &req = mdr->client_request;
5461 CInode *cur = rdlock_path_pin_ref(mdr, true);
5462 if (!cur) return;
5463
5464 if (mdr->snapid != CEPH_NOSNAP) {
5465 respond_to_request(mdr, -CEPHFS_EROFS);
5466 return;
5467 }
5468 if (!cur->is_file()) {
5469 respond_to_request(mdr, -CEPHFS_EINVAL);
5470 return;
5471 }
5472 if (cur->get_projected_inode()->size ||
5473 cur->get_projected_inode()->truncate_seq > 1) {
5474 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5475 return;
5476 }
5477
5478 // validate layout
5479 file_layout_t layout = cur->get_projected_inode()->layout;
5480 // save existing layout for later
5481 const auto old_layout = layout;
5482
5483 int access = MAY_WRITE;
5484
5485 if (req->head.args.setlayout.layout.fl_object_size > 0)
5486 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5487 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5488 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5489 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5490 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5491 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5492 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5493
5494 // make sure we have as new a map as the client
5495 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5496 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5497 return;
5498 }
5499 }
5500
5501 // Don't permit layout modifications without 'p' caps
5502 if (layout != old_layout) {
5503 access |= MAY_SET_VXATTR;
5504 }
5505
5506 if (!layout.is_valid()) {
5507 dout(10) << "bad layout" << dendl;
5508 respond_to_request(mdr, -CEPHFS_EINVAL);
5509 return;
5510 }
5511 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5512 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5513 respond_to_request(mdr, -CEPHFS_EINVAL);
5514 return;
5515 }
5516
5517 MutationImpl::LockOpVec lov;
5518 lov.add_xlock(&cur->filelock);
5519 if (!mds->locker->acquire_locks(mdr, lov))
5520 return;
5521
5522 if (!check_access(mdr, cur, access))
5523 return;
5524
5525 // project update
5526 auto pi = cur->project_inode(mdr);
5527 pi.inode->layout = layout;
5528 // add the old pool to the inode
5529 pi.inode->add_old_pool(old_layout.pool_id);
5530 pi.inode->version = cur->pre_dirty();
5531 pi.inode->ctime = mdr->get_op_stamp();
5532 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5533 pi.inode->rstat.rctime = mdr->get_op_stamp();
5534 pi.inode->change_attr++;
5535
5536 // log + wait
5537 mdr->ls = mdlog->get_current_segment();
5538 EUpdate *le = new EUpdate(mdlog, "setlayout");
5539 mdlog->start_entry(le);
5540 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5541 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5542 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5543
5544 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5545 }
5546
5547 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5548 {
5549 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5550 return true;
5551
5552 MutationImpl::LockOpVec lov;
5553 lov.add_xlock(&in->policylock);
5554 if (xlock_snaplock)
5555 lov.add_xlock(&in->snaplock);
5556 else
5557 lov.add_rdlock(&in->snaplock);
5558 if (!mds->locker->acquire_locks(mdr, lov))
5559 return false;
5560
5561 if (want_layout && in->get_projected_inode()->has_layout()) {
5562 mdr->dir_layout = in->get_projected_inode()->layout;
5563 want_layout = false;
5564 }
5565 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5566 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5567 return false;
5568 }
5569
5570 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5571 return true;
5572 }
5573
5574 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5575 {
5576 CInode *in = mdcache->get_inode(ino);
5577 if (!in || in->state_test(CInode::STATE_PURGING)) {
5578 respond_to_request(mdr, -CEPHFS_ESTALE);
5579 return nullptr;
5580 }
5581 if (!in->is_auth()) {
5582 mdcache->request_forward(mdr, in->authority().first);
5583 return nullptr;
5584 }
5585
5586 return in;
5587 }
5588
5589 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5590 {
5591 const cref_t<MClientRequest> &req = mdr->client_request;
5592
5593 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5594 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5595 if (!cur)
5596 return;
5597
5598 if (!cur->is_dir()) {
5599 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5600 return;
5601 }
5602
5603 if (!xlock_policylock(mdr, cur, true))
5604 return;
5605
5606 // validate layout
5607 const auto& old_pi = cur->get_projected_inode();
5608 file_layout_t layout;
5609 if (old_pi->has_layout())
5610 layout = old_pi->layout;
5611 else if (mdr->dir_layout != file_layout_t())
5612 layout = mdr->dir_layout;
5613 else
5614 layout = mdcache->default_file_layout;
5615
5616 // Level of access required to complete
5617 int access = MAY_WRITE;
5618
5619 const auto old_layout = layout;
5620
5621 if (req->head.args.setlayout.layout.fl_object_size > 0)
5622 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5623 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5624 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5625 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5626 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5627 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5628 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5629 // make sure we have as new a map as the client
5630 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5631 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5632 return;
5633 }
5634 }
5635
5636 if (layout != old_layout) {
5637 access |= MAY_SET_VXATTR;
5638 }
5639
5640 if (!layout.is_valid()) {
5641 dout(10) << "bad layout" << dendl;
5642 respond_to_request(mdr, -CEPHFS_EINVAL);
5643 return;
5644 }
5645 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5646 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5647 respond_to_request(mdr, -CEPHFS_EINVAL);
5648 return;
5649 }
5650
5651 if (!check_access(mdr, cur, access))
5652 return;
5653
5654 auto pi = cur->project_inode(mdr);
5655 pi.inode->layout = layout;
5656 pi.inode->version = cur->pre_dirty();
5657
5658 // log + wait
5659 mdr->ls = mdlog->get_current_segment();
5660 EUpdate *le = new EUpdate(mdlog, "setlayout");
5661 mdlog->start_entry(le);
5662 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5663 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5664 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5665
5666 mdr->no_early_reply = true;
5667 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5668 }
5669
5670 // XATTRS
5671 int Server::parse_layout_vxattr_json(
5672 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5673 {
5674 auto parse_pool = [&](std::string pool_name, int64_t pool_id) -> int64_t {
5675 if (pool_name != "") {
5676 int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
5677 if (_pool_id < 0) {
5678 dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
5679 return -CEPHFS_EINVAL;
5680 }
5681 return _pool_id;
5682 } else if (pool_id >= 0) {
5683 const auto pools = osdmap.get_pools();
5684 if (pools.find(pool_id) == pools.end()) {
5685 dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
5686 return -CEPHFS_EINVAL;
5687 }
5688 return pool_id;
5689 } else {
5690 return -CEPHFS_EINVAL;
5691 }
5692 };
5693
5694 try {
5695 if (name == "layout.json") {
5696 JSONParser json_parser;
5697 if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
5698 std::string field;
5699 try {
5700 field = "object_size";
5701 JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
5702
5703 field = "stripe_unit";
5704 JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
5705
5706 field = "stripe_count";
5707 JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
5708
5709 field = "pool_namespace";
5710 JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
5711
5712 field = "pool_id";
5713 int64_t pool_id = 0;
5714 JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
5715
5716 field = "pool_name";
5717 std::string pool_name;
5718 JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
5719
5720 pool_id = parse_pool(pool_name, pool_id);
5721 if (pool_id < 0) {
5722 return (int)pool_id;
5723 }
5724 layout->pool_id = pool_id;
5725 } catch (JSONDecoder::err&) {
5726 dout(10) << __func__ << ": json is missing a mandatory field named "
5727 << field << dendl;
5728 return -CEPHFS_EINVAL;
5729 }
5730 } else {
5731 dout(10) << __func__ << ": bad json" << dendl;
5732 return -CEPHFS_EINVAL;
5733 }
5734 } else {
5735 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5736 return -CEPHFS_ENODATA; // no such attribute
5737 }
5738 } catch (boost::bad_lexical_cast const&) {
5739 dout(10) << __func__ << ": bad vxattr value:" << value
5740 << ", unable to parse for xattr:" << name << dendl;
5741 return -CEPHFS_EINVAL;
5742 }
5743 return 0;
5744 }
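// Illustrative example of a value accepted above for "layout.json"
// (object_size, stripe_unit and stripe_count are mandatory; the pool may be
// given as "pool_name" or "pool_id", and "pool_namespace" is optional):
//   {"object_size": 4194304, "stripe_unit": 4194304, "stripe_count": 1,
//    "pool_name": "cephfs_data"}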
5745
5746 // parse old style layout string
5747 int Server::parse_layout_vxattr_string(
5748 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5749 {
5750 try {
5751 if (name == "layout") {
5752 string::iterator begin = value.begin();
5753 string::iterator end = value.end();
5754 keys_and_values<string::iterator> p; // create instance of parser
5755 std::map<string, string> m; // map to receive results
5756 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5757 return -CEPHFS_EINVAL;
5758 }
5759 string left(begin, end);
5760 dout(10) << __func__ << ": parsed " << m << " left '" << left << "'" << dendl;
5761 if (begin != end)
5762 return -CEPHFS_EINVAL;
5763 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5764 // Skip validation on each attr; we do it once at the end (to avoid
5765 // rejecting intermediate states if the overall result is ok)
5766 int r = parse_layout_vxattr_string(string("layout.") + q->first, q->second,
5767 osdmap, layout);
5768 if (r < 0)
5769 return r;
5770 }
5771 } else if (name == "layout.object_size") {
5772 layout->object_size = boost::lexical_cast<unsigned>(value);
5773 } else if (name == "layout.stripe_unit") {
5774 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5775 } else if (name == "layout.stripe_count") {
5776 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5777 } else if (name == "layout.pool") {
5778 try {
5779 layout->pool_id = boost::lexical_cast<unsigned>(value);
5780 } catch (boost::bad_lexical_cast const&) {
5781 int64_t pool = osdmap.lookup_pg_pool_name(value);
5782 if (pool < 0) {
5783 dout(10) << __func__ << ": unknown pool " << value << dendl;
5784 return -CEPHFS_ENOENT;
5785 }
5786 layout->pool_id = pool;
5787 }
5788 } else if (name == "layout.pool_id") {
5789 layout->pool_id = boost::lexical_cast<int64_t>(value);
5790 } else if (name == "layout.pool_name") {
5791 layout->pool_id = osdmap.lookup_pg_pool_name(value);
5792 if (layout->pool_id < 0) {
5793 dout(10) << __func__ << ": unknown pool " << value << dendl;
5794 return -CEPHFS_EINVAL;
5795 }
5796 } else if (name == "layout.pool_namespace") {
5797 layout->pool_ns = value;
5798 } else {
5799 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5800 return -CEPHFS_ENODATA; // no such attribute
5801 }
5802 } catch (boost::bad_lexical_cast const&) {
5803 dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
5804 << name << dendl;
5805 return -CEPHFS_EINVAL;
5806 }
5807 return 0;
5808 }
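// Illustrative example of the old-style string form parsed above, given as
// key=value pairs (each key is re-parsed as "layout.<key>" and the result is
// validated once at the end):
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"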
5809
5810 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5811 file_layout_t *layout, bool validate)
5812 {
5813 dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
5814
5815 int r;
5816 if (name == "layout.json") {
5817 r = parse_layout_vxattr_json(name, value, osdmap, layout);
5818 } else {
5819 r = parse_layout_vxattr_string(name, value, osdmap, layout);
5820 }
5821 if (r < 0) {
5822 return r;
5823 }
5824
5825 if (validate && !layout->is_valid()) {
5826 dout(10) << __func__ << ": bad layout" << dendl;
5827 return -CEPHFS_EINVAL;
5828 }
5829 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5830 dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
5831 return -CEPHFS_EINVAL;
5832 }
5833 return 0;
5834 }
5835
5836 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5837 {
5838 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5839 try {
5840 if (name == "quota") {
5841 string::iterator begin = value.begin();
5842 string::iterator end = value.end();
5843 if (begin == end) {
5844 // keep quota unchanged. (for create_quota_realm())
5845 return 0;
5846 }
5847 keys_and_values<string::iterator> p; // create instance of parser
5848 std::map<string, string> m; // map to receive results
5849 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5850 return -CEPHFS_EINVAL;
5851 }
5852 string left(begin, end);
5853 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5854 if (begin != end)
5855 return -CEPHFS_EINVAL;
5856 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5857 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5858 if (r < 0)
5859 return r;
5860 }
5861 } else if (name == "quota.max_bytes") {
5862 int64_t q = boost::lexical_cast<int64_t>(value);
5863 if (q < 0)
5864 return -CEPHFS_EINVAL;
5865 quota->max_bytes = q;
5866 } else if (name == "quota.max_files") {
5867 int64_t q = boost::lexical_cast<int64_t>(value);
5868 if (q < 0)
5869 return -CEPHFS_EINVAL;
5870 quota->max_files = q;
5871 } else {
5872 dout(10) << " unknown quota vxattr " << name << dendl;
5873 return -CEPHFS_EINVAL;
5874 }
5875 } catch (boost::bad_lexical_cast const&) {
5876 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5877 return -CEPHFS_EINVAL;
5878 }
5879
5880 if (!quota->is_valid()) {
5881 dout(10) << "bad quota" << dendl;
5882 return -CEPHFS_EINVAL;
5883 }
5884 return 0;
5885 }
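// Illustrative examples of accepted quota vxattr values: a combined "quota"
// value such as "max_bytes=100000000 max_files=10000", or a plain non-negative
// integer for "quota.max_bytes" / "quota.max_files"; an empty value leaves the
// quota unchanged (used by create_quota_realm()).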
5886
5887 void Server::create_quota_realm(CInode *in)
5888 {
5889 dout(10) << __func__ << " " << *in << dendl;
5890
5891 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5892 req->set_filepath(filepath(in->ino()));
5893 req->set_string2("ceph.quota");
5894 // empty vxattr value
5895 req->set_tid(mds->issue_tid());
5896
5897 mds->send_message_mds(req, in->authority().first);
5898 }
5899
5900 /*
5901 * Verify that the file layout attribute carried by the client
5902 * is well-formed.
5903 * Returns 0 on success; otherwise this function takes
5904 * responsibility for the passed mdr.
5905 */
5906 int Server::check_layout_vxattr(MDRequestRef& mdr,
5907 string name,
5908 string value,
5909 file_layout_t *layout)
5910 {
5911 const cref_t<MClientRequest> &req = mdr->client_request;
5912 epoch_t epoch;
5913 int r;
5914
5915 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5916 r = parse_layout_vxattr(name, value, osdmap, layout);
5917 epoch = osdmap.get_epoch();
5918 });
5919
5920 if (r == -CEPHFS_ENOENT) {
5921
5922 // we don't have the specified pool; make sure our map
5923 // is at least as new as the client's.
5924 epoch_t req_epoch = req->get_osdmap_epoch();
5925
5926 if (req_epoch > epoch) {
5927
5928 // well, our map is older; wait for a newer osdmap and retry.
5929 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5930
5931 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5932 return r;
5933 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5934
5935 // For compatibility with clients running old code, we still need to get the
5936 // latest map. Once the COMPACT_VERSION of MClientRequest is >= 3,
5937 // we can remove this code.
5938 mdr->waited_for_osdmap = true;
5939 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5940 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5941 return r;
5942 }
5943 }
5944
5945 if (r < 0) {
5946
5947 if (r == -CEPHFS_ENOENT)
5948 r = -CEPHFS_EINVAL;
5949
5950 respond_to_request(mdr, r);
5951 return r;
5952 }
5953
5954 // all is well
5955 return 0;
5956 }
5957
5958 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5959 {
5960 const cref_t<MClientRequest> &req = mdr->client_request;
5961 MutationImpl::LockOpVec lov;
5962 string name(req->get_path2());
5963 bufferlist bl = req->get_data();
5964 string value (bl.c_str(), bl.length());
5965 dout(10) << "handle_set_vxattr " << name
5966 << " val " << value.length()
5967 << " bytes on " << *cur
5968 << dendl;
5969
5970 CInode::mempool_inode *pip = nullptr;
5971 string rest;
5972
5973 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5974 return;
5975 }
5976
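// Dispatch on the vxattr name: ceph.dir.layout*, ceph.file.layout*, ceph.quota*,
// ceph.dir.subvolume, ceph.dir.pin, ceph.dir.pin.random and
// ceph.dir.pin.distributed are handled below; any other name is rejected
// with CEPHFS_EINVAL.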
5977 bool adjust_realm = false;
5978 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5979 if (!cur->is_dir()) {
5980 respond_to_request(mdr, -CEPHFS_EINVAL);
5981 return;
5982 }
5983
5984 if (!xlock_policylock(mdr, cur, true))
5985 return;
5986
5987 /* We need 'As' caps for the fscrypt context */
5988 lov.add_xlock(&cur->authlock);
5989 if (!mds->locker->acquire_locks(mdr, lov)) {
5990 return;
5991 }
5992
5993 /* encrypted directories can't have their layout changed */
5994 if (!cur->get_inode()->fscrypt_auth.empty()) {
5995 respond_to_request(mdr, -CEPHFS_EINVAL);
5996 return;
5997 }
5998
5999 file_layout_t layout;
6000 if (cur->get_projected_inode()->has_layout())
6001 layout = cur->get_projected_inode()->layout;
6002 else if (mdr->dir_layout != file_layout_t())
6003 layout = mdr->dir_layout;
6004 else
6005 layout = mdcache->default_file_layout;
6006
6007 rest = name.substr(name.find("layout"));
6008 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
6009 return;
6010
6011 auto pi = cur->project_inode(mdr);
6012 pi.inode->layout = layout;
6013 mdr->no_early_reply = true;
6014 pip = pi.inode.get();
6015 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
6016 if (!cur->is_file()) {
6017 respond_to_request(mdr, -CEPHFS_EINVAL);
6018 return;
6019 }
6020 if (cur->get_projected_inode()->size ||
6021 cur->get_projected_inode()->truncate_seq > 1) {
6022 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
6023 return;
6024 }
6025 file_layout_t layout = cur->get_projected_inode()->layout;
6026 rest = name.substr(name.find("layout"));
6027 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
6028 return;
6029
6030 lov.add_xlock(&cur->filelock);
6031 if (!mds->locker->acquire_locks(mdr, lov))
6032 return;
6033
6034 /* encrypted files can't have their layout changed */
6035 if (!cur->get_inode()->fscrypt_auth.empty()) {
6036 respond_to_request(mdr, -CEPHFS_EINVAL);
6037 return;
6038 }
6039
6040 auto pi = cur->project_inode(mdr);
6041 int64_t old_pool = pi.inode->layout.pool_id;
6042 pi.inode->add_old_pool(old_pool);
6043 pi.inode->layout = layout;
6044 pip = pi.inode.get();
6045 } else if (name.compare(0, 10, "ceph.quota") == 0) {
6046 if (!cur->is_dir()) {
6047 respond_to_request(mdr, -CEPHFS_EINVAL);
6048 return;
6049 }
6050
6051 quota_info_t quota = cur->get_projected_inode()->quota;
6052
6053 rest = name.substr(name.find("quota"));
6054 int r = parse_quota_vxattr(rest, value, &quota);
6055 if (r < 0) {
6056 respond_to_request(mdr, r);
6057 return;
6058 }
6059
6060 if (quota.is_enabled() && !cur->get_projected_srnode())
6061 adjust_realm = true;
6062
6063 if (!xlock_policylock(mdr, cur, false, adjust_realm))
6064 return;
6065
6066 if (cur->get_projected_inode()->quota == quota) {
6067 respond_to_request(mdr, 0);
6068 return;
6069 }
6070
6071 auto pi = cur->project_inode(mdr, false, adjust_realm);
6072 pi.inode->quota = quota;
6073
6074 if (adjust_realm)
6075 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
6076
6077 mdr->no_early_reply = true;
6078 pip = pi.inode.get();
6079
6080 client_t exclude_ct = mdr->get_client();
6081 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
6082 } else if (name == "ceph.dir.subvolume"sv) {
6083 if (!cur->is_dir()) {
6084 respond_to_request(mdr, -CEPHFS_EINVAL);
6085 return;
6086 }
6087
6088 bool val;
6089 try {
6090 val = boost::lexical_cast<bool>(value);
6091 } catch (boost::bad_lexical_cast const&) {
6092 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
6093 respond_to_request(mdr, -CEPHFS_EINVAL);
6094 return;
6095 }
6096
6097 /* Verify it's not already a subvolume, using a lighter-weight
6098 * rdlock.
6099 */
6100 if (!mdr->more()->rdonly_checks) {
6101 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6102 lov.add_rdlock(&cur->snaplock);
6103 if (!mds->locker->acquire_locks(mdr, lov))
6104 return;
6105 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6106 }
6107 const auto srnode = cur->get_projected_srnode();
6108 if (val == (srnode && srnode->is_subvolume())) {
6109 dout(20) << "already marked subvolume" << dendl;
6110 respond_to_request(mdr, 0);
6111 return;
6112 }
6113 mdr->more()->rdonly_checks = true;
6114 }
6115
6116 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
6117 /* drop the rdlock and acquire xlocks */
6118 dout(20) << "dropping rdlocks" << dendl;
6119 mds->locker->drop_locks(mdr.get());
6120 if (!xlock_policylock(mdr, cur, false, true))
6121 return;
6122 }
6123
6124 /* repeat the read-only checks in case state changed between dropping the rdlock and taking the xlock */
6125 SnapRealm *realm = cur->find_snaprealm();
6126 if (val) {
6127 inodeno_t subvol_ino = realm->get_subvolume_ino();
6128 // can't create subvolume inside another subvolume
6129 if (subvol_ino && subvol_ino != cur->ino()) {
6130 respond_to_request(mdr, -CEPHFS_EINVAL);
6131 return;
6132 }
6133 }
6134
6135 const auto srnode = cur->get_projected_srnode();
6136 if (val == (srnode && srnode->is_subvolume())) {
6137 respond_to_request(mdr, 0);
6138 return;
6139 }
6140
6141 auto pi = cur->project_inode(mdr, false, true);
6142 if (!srnode)
6143 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
6144 if (val)
6145 pi.snapnode->mark_subvolume();
6146 else
6147 pi.snapnode->clear_subvolume();
6148
6149 mdr->no_early_reply = true;
6150 pip = pi.inode.get();
6151 adjust_realm = true;
6152 } else if (name == "ceph.dir.pin"sv) {
6153 if (!cur->is_dir() || cur->is_root()) {
6154 respond_to_request(mdr, -CEPHFS_EINVAL);
6155 return;
6156 }
6157
6158 mds_rank_t rank;
6159 try {
6160 rank = boost::lexical_cast<mds_rank_t>(value);
6161 if (rank < 0) rank = MDS_RANK_NONE;
6162 else if (rank >= MAX_MDS) {
6163 respond_to_request(mdr, -CEPHFS_EDOM);
6164 return;
6165 }
6166 } catch (boost::bad_lexical_cast const&) {
6167 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
6168 respond_to_request(mdr, -CEPHFS_EINVAL);
6169 return;
6170 }
6171
6172 if (!xlock_policylock(mdr, cur))
6173 return;
6174
6175 auto pi = cur->project_inode(mdr);
6176 cur->set_export_pin(rank);
6177 pip = pi.inode.get();
6178 } else if (name == "ceph.dir.pin.random"sv) {
6179 if (!cur->is_dir() || cur->is_root()) {
6180 respond_to_request(mdr, -CEPHFS_EINVAL);
6181 return;
6182 }
6183
6184 double val;
6185 try {
6186 val = boost::lexical_cast<double>(value);
6187 } catch (boost::bad_lexical_cast const&) {
6188 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
6189 respond_to_request(mdr, -CEPHFS_EINVAL);
6190 return;
6191 }
6192
6193 if (val < 0.0 || 1.0 < val) {
6194 respond_to_request(mdr, -CEPHFS_EDOM);
6195 return;
6196 } else if (mdcache->export_ephemeral_random_max < val) {
6197 respond_to_request(mdr, -CEPHFS_EINVAL);
6198 return;
6199 }
6200
6201 if (!xlock_policylock(mdr, cur))
6202 return;
6203
6204 auto pi = cur->project_inode(mdr);
6205 cur->setxattr_ephemeral_rand(val);
6206 pip = pi.inode.get();
6207 } else if (name == "ceph.dir.pin.distributed"sv) {
6208 if (!cur->is_dir() || cur->is_root()) {
6209 respond_to_request(mdr, -CEPHFS_EINVAL);
6210 return;
6211 }
6212
6213 bool val;
6214 try {
6215 val = boost::lexical_cast<bool>(value);
6216 } catch (boost::bad_lexical_cast const&) {
6217 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
6218 respond_to_request(mdr, -CEPHFS_EINVAL);
6219 return;
6220 }
6221
6222 if (!xlock_policylock(mdr, cur))
6223 return;
6224
6225 auto pi = cur->project_inode(mdr);
6226 cur->setxattr_ephemeral_dist(val);
6227 pip = pi.inode.get();
6228 } else {
6229 dout(10) << " unknown vxattr " << name << dendl;
6230 respond_to_request(mdr, -CEPHFS_EINVAL);
6231 return;
6232 }
6233
6234 pip->change_attr++;
6235 pip->ctime = mdr->get_op_stamp();
6236 if (mdr->get_op_stamp() > pip->rstat.rctime)
6237 pip->rstat.rctime = mdr->get_op_stamp();
6238 pip->version = cur->pre_dirty();
6239 if (cur->is_file())
6240 pip->update_backtrace();
6241
6242 // log + wait
6243 mdr->ls = mdlog->get_current_segment();
6244 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
6245 mdlog->start_entry(le);
6246 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6247 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6248 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6249
6250 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
6251 false, false, adjust_realm));
6252 return;
6253 }
6254
6255 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
6256 {
6257 const cref_t<MClientRequest> &req = mdr->client_request;
6258 string name(req->get_path2());
6259
6260 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
6261
6262 if (name == "ceph.dir.layout") {
6263 if (!cur->is_dir()) {
6264 respond_to_request(mdr, -CEPHFS_ENODATA);
6265 return;
6266 }
6267 if (cur->is_root()) {
6268 dout(10) << "can't remove layout policy on the root directory" << dendl;
6269 respond_to_request(mdr, -CEPHFS_EINVAL);
6270 return;
6271 }
6272
6273 if (!cur->get_projected_inode()->has_layout()) {
6274 respond_to_request(mdr, -CEPHFS_ENODATA);
6275 return;
6276 }
6277
6278 MutationImpl::LockOpVec lov;
6279 lov.add_xlock(&cur->policylock);
6280 if (!mds->locker->acquire_locks(mdr, lov))
6281 return;
6282
6283 auto pi = cur->project_inode(mdr);
6284 pi.inode->clear_layout();
6285 pi.inode->version = cur->pre_dirty();
6286
6287 // log + wait
6288 mdr->ls = mdlog->get_current_segment();
6289 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
6290 mdlog->start_entry(le);
6291 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6292 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6293 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6294
6295 mdr->no_early_reply = true;
6296 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6297 return;
6298 } else if (name == "ceph.dir.layout.pool_namespace"
6299 || name == "ceph.file.layout.pool_namespace") {
6300 // Namespace is the only layout field that has a meaningful
6301 // null/none value (an empty string means the default layout). Removing it
6302 // is equivalent to a setxattr with an empty string: pass the empty payload of
6303 // the rmxattr request through to do this.
6304 handle_set_vxattr(mdr, cur);
6305 return;
6306 }
6307
6308 respond_to_request(mdr, -CEPHFS_ENODATA);
6309 }
6310
6311 const Server::XattrHandler Server::xattr_handlers[] = {
6312 {
6313 xattr_name: Server::DEFAULT_HANDLER,
6314 description: "default xattr handler",
6315 validate: &Server::default_xattr_validate,
6316 setxattr: &Server::default_setxattr_handler,
6317 removexattr: &Server::default_removexattr_handler,
6318 },
6319 {
6320 xattr_name: "ceph.mirror.info",
6321 description: "mirror info xattr handler",
6322 validate: &Server::mirror_info_xattr_validate,
6323 setxattr: &Server::mirror_info_setxattr_handler,
6324 removexattr: &Server::mirror_info_removexattr_handler
6325 },
6326 };
6327
6328 const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
6329 const XattrHandler *default_xattr_handler = nullptr;
6330
6331 for (auto &handler : xattr_handlers) {
6332 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
6333 ceph_assert(default_xattr_handler == nullptr);
6334 default_xattr_handler = &handler;
6335 }
6336 if (handler.xattr_name == xattr_name) {
6337 dout(20) << "handler=" << handler.description << dendl;
6338 return &handler;
6339 }
6340 }
6341
6342 ceph_assert(default_xattr_handler != nullptr);
6343 dout(20) << "handler=" << default_xattr_handler->description << dendl;
6344 return default_xattr_handler;
6345 }
6346
6347 int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6348 const std::string &xattr_name, int op, int flags) {
6349 if (op == CEPH_MDS_OP_SETXATTR) {
6350 if (xattrs) {
6351 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
6352 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
6353 return -CEPHFS_EEXIST;
6354 }
6355 }
6356 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
6357 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
6358 return -CEPHFS_ENODATA;
6359 }
6360
6361 return 0;
6362 }
6363
6364 if (op == CEPH_MDS_OP_RMXATTR) {
6365 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6366 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6367 return -CEPHFS_ENODATA;
6368 }
6369
6370 return 0;
6371 }
6372
6373 derr << ": unhandled validation for: " << xattr_name << dendl;
6374 return -CEPHFS_EINVAL;
6375 }
6376
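// Insert or overwrite a single xattr: the value is copied into a fresh
// bufferptr and stored under xattr_name, replacing any existing entry with
// the same name.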
6377 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6378 const bufferlist &xattr_value) {
6379 size_t len = xattr_value.length();
6380 bufferptr b = buffer::create(len);
6381 if (len) {
6382 xattr_value.begin().copy(len, b.c_str());
6383 }
6384 auto em = xattrs->emplace(std::piecewise_construct,
6385 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6386 std::forward_as_tuple(b));
6387 if (!em.second) {
6388 em.first->second = b;
6389 }
6390 }
6391
6392 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6393 xattrs->erase(mempool::mds_co::string(xattr_name));
6394 }
6395
6396 int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6397 XattrOp *xattr_op) {
6398 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6399 }
6400
6401 void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6402 const XattrOp &xattr_op) {
6403 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6404 }
6405
6406 void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6407 const XattrOp &xattr_op) {
6408 xattr_rm(xattrs, xattr_op.xattr_name);
6409 }
6410
6411 // mirror info xattr handlers
6412 const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6413 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6414 "[a-f0-9]{4}-[a-f0-9]{12})" \
6415 " fs_id=(\\d+)$";
6416 const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6417 const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
6418 int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6419 std::string &cluster_id, std::string &fs_id) {
6420 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6421
6422 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6423 std::smatch match;
6424
6425 std::regex_search(value, match, regex);
6426 if (match.size() != 3) {
6427 derr << "mirror info parse error" << dendl;
6428 return -CEPHFS_EINVAL;
6429 }
6430
6431 cluster_id = match[1];
6432 fs_id = match[2];
6433 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6434 return 0;
6435 }
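// Illustrative value matching MIRROR_INFO_REGEX above (the cluster_id shown
// is a hypothetical UUID):
//   "cluster_id=3a6b1c2d-4e5f-4a6b-8c7d-9e0f1a2b3c4d fs_id=1"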
6436
6437 int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6438 XattrOp *xattr_op) {
6439 if (!cur->is_root()) {
6440 return -CEPHFS_EINVAL;
6441 }
6442
6443 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6444 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6445 if (v1 != v2) {
6446 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6447 return -CEPHFS_EINVAL;
6448 }
6449
6450 if (v1 < 0) {
6451 return v1;
6452 }
6453
6454 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6455 return 0;
6456 }
6457
6458 std::string cluster_id;
6459 std::string fs_id;
6460 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6461 cluster_id, fs_id);
6462 if (r < 0) {
6463 return r;
6464 }
6465
6466 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6467 return 0;
6468 }
6469
6470 void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6471 const XattrOp &xattr_op) {
6472 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6473
6474 bufferlist bl;
6475 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6476 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6477
6478 bl.clear();
6479 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6480 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6481 }
6482
6483 void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6484 const XattrOp &xattr_op) {
6485 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6486 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6487 }
6488
6489 void Server::handle_client_setxattr(MDRequestRef& mdr)
6490 {
6491 const cref_t<MClientRequest> &req = mdr->client_request;
6492 string name(req->get_path2());
6493
6494 // is a ceph virtual xattr?
6495 if (is_ceph_vxattr(name)) {
6496 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6497 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6498 if (!cur)
6499 return;
6500
6501 handle_set_vxattr(mdr, cur);
6502 return;
6503 }
6504
6505 if (!is_allowed_ceph_xattr(name)) {
6506 respond_to_request(mdr, -CEPHFS_EINVAL);
6507 return;
6508 }
6509
6510 CInode *cur = rdlock_path_pin_ref(mdr, true);
6511 if (!cur)
6512 return;
6513
6514 if (mdr->snapid != CEPH_NOSNAP) {
6515 respond_to_request(mdr, -CEPHFS_EROFS);
6516 return;
6517 }
6518
6519 int flags = req->head.args.setxattr.flags;
6520
6521 MutationImpl::LockOpVec lov;
6522 lov.add_xlock(&cur->xattrlock);
6523 if (!mds->locker->acquire_locks(mdr, lov))
6524 return;
6525
6526 if (!check_access(mdr, cur, MAY_WRITE))
6527 return;
6528
6529 size_t len = req->get_data().length();
6530 size_t inc = len + name.length();
6531
6532 auto handler = Server::get_xattr_or_default_handler(name);
6533 const auto& pxattrs = cur->get_projected_xattrs();
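// Enforce mds_max_xattr_pairs_size: sum the sizes of the existing key/value
// pairs (skipping a same-named pair that CEPH_XATTR_REPLACE is about to
// overwrite) and reject the request with ENOSPC if adding this pair would
// exceed the limit.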
6534 if (pxattrs) {
6535 // check xattrs kv pairs size
6536 size_t cur_xattrs_size = 0;
6537 for (const auto& p : *pxattrs) {
6538 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6539 continue;
6540 }
6541 cur_xattrs_size += p.first.length() + p.second.length();
6542 }
6543
6544 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6545 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6546 << cur_xattrs_size << ", inc " << inc << dendl;
6547 respond_to_request(mdr, -CEPHFS_ENOSPC);
6548 return;
6549 }
6550 }
6551
6552 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6553 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6554 if (r < 0) {
6555 respond_to_request(mdr, r);
6556 return;
6557 }
6558
6559 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6560
6561 // project update
6562 auto pi = cur->project_inode(mdr, true);
6563 pi.inode->version = cur->pre_dirty();
6564 pi.inode->ctime = mdr->get_op_stamp();
6565 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6566 pi.inode->rstat.rctime = mdr->get_op_stamp();
6567 pi.inode->change_attr++;
6568 pi.inode->xattr_version++;
6569
6570 if ((flags & CEPH_XATTR_REMOVE)) {
6571 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6572 } else {
6573 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6574 }
6575
6576 // log + wait
6577 mdr->ls = mdlog->get_current_segment();
6578 EUpdate *le = new EUpdate(mdlog, "setxattr");
6579 mdlog->start_entry(le);
6580 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6581 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6582 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6583
6584 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6585 }
6586
6587 void Server::handle_client_removexattr(MDRequestRef& mdr)
6588 {
6589 const cref_t<MClientRequest> &req = mdr->client_request;
6590 std::string name(req->get_path2());
6591
6592 // is a ceph virtual xattr?
6593 if (is_ceph_vxattr(name)) {
6594 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6595 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6596 if (!cur)
6597 return;
6598
6599 handle_remove_vxattr(mdr, cur);
6600 return;
6601 }
6602
6603 if (!is_allowed_ceph_xattr(name)) {
6604 respond_to_request(mdr, -CEPHFS_EINVAL);
6605 return;
6606 }
6607
6608 CInode* cur = rdlock_path_pin_ref(mdr, true);
6609 if (!cur)
6610 return;
6611
6612 if (mdr->snapid != CEPH_NOSNAP) {
6613 respond_to_request(mdr, -CEPHFS_EROFS);
6614 return;
6615 }
6616
6617 MutationImpl::LockOpVec lov;
6618 lov.add_xlock(&cur->xattrlock);
6619 if (!mds->locker->acquire_locks(mdr, lov))
6620 return;
6621
6622
6623 auto handler = Server::get_xattr_or_default_handler(name);
6624 bufferlist bl;
6625 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6626
6627 const auto& pxattrs = cur->get_projected_xattrs();
6628 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6629 if (r < 0) {
6630 respond_to_request(mdr, r);
6631 return;
6632 }
6633
6634 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6635
6636 // project update
6637 auto pi = cur->project_inode(mdr, true);
6638 pi.inode->version = cur->pre_dirty();
6639 pi.inode->ctime = mdr->get_op_stamp();
6640 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6641 pi.inode->rstat.rctime = mdr->get_op_stamp();
6642 pi.inode->change_attr++;
6643 pi.inode->xattr_version++;
6644 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6645
6646 // log + wait
6647 mdr->ls = mdlog->get_current_segment();
6648 EUpdate *le = new EUpdate(mdlog, "removexattr");
6649 mdlog->start_entry(le);
6650 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6651 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6652 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6653
6654 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6655 }
6656
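// Illustrative client-side query that this op serves (ceph virtual xattrs are
// read via getxattr; the mountpoint and path below are examples only):
//   getfattr -n ceph.dir.layout.json /mnt/cephfs/some/dir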
6657 void Server::handle_client_getvxattr(MDRequestRef& mdr)
6658 {
6659 const auto& req = mdr->client_request;
6660 string xattr_name{req->get_path2()};
6661
6662 // is a ceph virtual xattr?
6663 if (!is_ceph_vxattr(xattr_name)) {
6664 respond_to_request(mdr, -CEPHFS_ENODATA);
6665 return;
6666 }
6667
6668 CInode *cur = rdlock_path_pin_ref(mdr, true, false);
6669 if (!cur) {
6670 return;
6671 }
6672
6673 if (is_ceph_dir_vxattr(xattr_name)) {
6674 if (!cur->is_dir()) {
6675 respond_to_request(mdr, -CEPHFS_ENODATA);
6676 return;
6677 }
6678 } else if (is_ceph_file_vxattr(xattr_name)) {
6679 if (cur->is_dir()) {
6680 respond_to_request(mdr, -CEPHFS_ENODATA);
6681 return;
6682 }
6683 }
6684
6685 CachedStackStringStream css;
6686 int r = 0;
6687 ceph::bufferlist bl;
6688 // handle these vxattrs
6689 if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
6690 (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
6691 std::string layout_field;
6692
6693 struct layout_xattr_info_t {
6694 enum class InheritanceStatus : uint32_t {
6695 DEFAULT = 0,
6696 SET = 1,
6697 INHERITED = 2
6698 };
6699
6700 const file_layout_t layout;
6701 const InheritanceStatus status;
6702
6703 layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
6704 : layout(l), status(inh) { }
6705
6706 static std::string status_to_string(InheritanceStatus status) {
6707 switch (status) {
6708 case InheritanceStatus::DEFAULT: return "default"s;
6709 case InheritanceStatus::SET: return "set"s;
6710 case InheritanceStatus::INHERITED: return "inherited"s;
6711 default: return "unknown"s;
6712 }
6713 }
6714 };
6715
6716 auto is_default_layout = [&](const file_layout_t& layout) -> bool {
6717 return (layout == mdcache->default_file_layout);
6718 };
6719 auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
6720 auto orig_in = cur;
6721
6722 while (cur) {
6723 if (cur->get_projected_inode()->has_layout()) {
6724 auto& curr_layout = cur->get_projected_inode()->layout;
6725 if (is_default_layout(curr_layout)) {
6726 return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
6727 }
6728 if (cur == orig_in) {
6729 // we've found a new layout at this inode
6730 return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
6731 } else {
6732 return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
6733 }
6734 }
6735
6736 if (cur->is_root()) {
6737 break;
6738 }
6739
6740 cur = cur->get_projected_parent_dir()->get_inode();
6741 }
6742 mds->clog->error() << "no layout found at root dir!";
6743 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6744 };
6745
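// For the ".json" variants the reply placed in css is a small JSON object,
// e.g. (field values are illustrative only):
//   {"stripe_unit": 4194304, "stripe_count": 1, "object_size": 4194304,
//    "pool_name": "cephfs_data", "pool_id": 2, "pool_namespace": "",
//    "inheritance": "@set"}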
6746 if (xattr_name == "ceph.dir.layout.json"sv ||
6747 xattr_name == "ceph.file.layout.json"sv) {
6748 // fetch layout only for valid xattr_name
6749 const auto lxi = get_inherited_layout(cur);
6750
6751 *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
6752 << ", \"stripe_count\": " << lxi.layout.stripe_count
6753 << ", \"object_size\": " << lxi.layout.object_size
6754 << ", \"pool_name\": ";
6755 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6756 *css << "\"";
6757 if (o.have_pg_pool(lxi.layout.pool_id)) {
6758 *css << o.get_pool_name(lxi.layout.pool_id);
6759 }
6760 *css << "\"";
6761 });
6762 *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
6763 *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
6764 *css << ", \"inheritance\": \"@"
6765 << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
6766 } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
6767 (xattr_name == "ceph.file.layout.pool_name"sv)) {
6768 // fetch layout only for valid xattr_name
6769 const auto lxi = get_inherited_layout(cur);
6770 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6771 if (o.have_pg_pool(lxi.layout.pool_id)) {
6772 *css << o.get_pool_name(lxi.layout.pool_id);
6773 }
6774 });
6775 } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
6776 (xattr_name == "ceph.file.layout.pool_id"sv)) {
6777 // fetch layout only for valid xattr_name
6778 const auto lxi = get_inherited_layout(cur);
6779 *css << (uint64_t)lxi.layout.pool_id;
6780 } else {
6781 r = -CEPHFS_ENODATA; // no such attribute
6782 }
6783 } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
6784 if (xattr_name == "ceph.dir.pin"sv) {
6785 *css << cur->get_projected_inode()->export_pin;
6786 } else if (xattr_name == "ceph.dir.pin.random"sv) {
6787 *css << cur->get_projected_inode()->export_ephemeral_random_pin;
6788 } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
6789 *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
6790 } else {
6791 // otherwise respond that the attribute does not exist,
6792 // since we only handle ceph vxattrs here
6793 r = -CEPHFS_ENODATA; // no such attribute
6794 }
6795 } else {
6796 // otherwise respond that the attribute does not exist,
6797 // since we only handle ceph vxattrs here
6798 r = -CEPHFS_ENODATA; // no such attribute
6799 }
6800
6801 if (r == 0) {
6802 ENCODE_START(1, 1, bl);
6803 encode(css->strv(), bl);
6804 ENCODE_FINISH(bl);
6805 mdr->reply_extra_bl = bl;
6806 }
6807
6808 respond_to_request(mdr, r);
6809 }
6810
6811 // =================================================================
6812 // DIRECTORY and NAMESPACE OPS
6813
6814
6815 // ------------------------------------------------
6816
6817 // MKNOD
6818
6819 class C_MDS_mknod_finish : public ServerLogContext {
6820 CDentry *dn;
6821 CInode *newi;
6822 public:
6823 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6824 ServerLogContext(s, r), dn(d), newi(ni) {}
6825 void finish(int r) override {
6826 ceph_assert(r == 0);
6827
6828 // crash the current MDS so that the replacing MDS will test the journal
6829 ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
6830
6831 // link the inode
6832 dn->pop_projected_linkage();
6833
6834 // be a bit hacky with the inode version, here.. we decrement it
6835 // just to keep mark_dirty() happy. (we didn't bother projecting
6836 // a new version of the inode since it's just been created)
6837 newi->mark_dirty(mdr->ls);
6838 newi->mark_dirty_parent(mdr->ls, true);
6839
6840 // mkdir?
6841 if (newi->is_dir()) {
6842 CDir *dir = newi->get_dirfrag(frag_t());
6843 ceph_assert(dir);
6844 dir->mark_dirty(mdr->ls);
6845 dir->mark_new(mdr->ls);
6846 }
6847
6848 mdr->apply();
6849
6850 MDRequestRef null_ref;
6851 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6852
6853 if (newi->is_file()) {
6854 get_mds()->locker->share_inode_max_size(newi);
6855 } else if (newi->is_dir()) {
6856 // We do this now so that the linkages on the new directory are stable.
6857 newi->maybe_ephemeral_rand();
6858 }
6859
6860 // hit pop
6861 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6862
6863 // reply
6864 server->respond_to_request(mdr, 0);
6865 }
6866 };
6867
6868
6869 void Server::handle_client_mknod(MDRequestRef& mdr)
6870 {
6871 const cref_t<MClientRequest> &req = mdr->client_request;
6872 client_t client = mdr->get_client();
6873
6874 unsigned mode = req->head.args.mknod.mode;
6875 if ((mode & S_IFMT) == 0)
6876 mode |= S_IFREG;
6877
6878 mdr->disable_lock_cache();
6879 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, false, S_ISREG(mode));
6880 if (!dn)
6881 return;
6882
6883 CDir *dir = dn->get_dir();
6884 CInode *diri = dir->get_inode();
6885 if (!check_access(mdr, diri, MAY_WRITE))
6886 return;
6887 if (!check_fragment_space(mdr, dir))
6888 return;
6889 if (!check_dir_max_entries(mdr, dir))
6890 return;
6891
6892 ceph_assert(dn->get_projected_linkage()->is_null());
6893 if (req->get_alternate_name().size() > alternate_name_max) {
6894 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6895 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6896 return;
6897 }
6898 dn->set_alternate_name(req->get_alternate_name());
6899
6900 // set layout
6901 file_layout_t layout;
6902 if (mdr->dir_layout != file_layout_t())
6903 layout = mdr->dir_layout;
6904 else
6905 layout = mdcache->default_file_layout;
6906
6907 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6908 ceph_assert(newi);
6909
6910 dn->push_projected_linkage(newi);
6911
6912 auto _inode = newi->_get_inode();
6913 _inode->version = dn->pre_dirty();
6914 _inode->rdev = req->head.args.mknod.rdev;
6915 _inode->rstat.rfiles = 1;
6916 _inode->accounted_rstat = _inode->rstat;
6917 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6918 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6919 _inode->update_backtrace();
6920
6921 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6922 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6923 ceph_assert(follows >= realm->get_newest_seq());
6924
6925 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6926 // want to write to it (e.g., if they are re-exporting the filesystem over NFS)
6927 if (S_ISREG(_inode->mode)) {
6928 // issue a cap on the file
6929 int cmode = CEPH_FILE_MODE_RDWR;
6930 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6931 if (cap) {
6932 cap->set_wanted(0);
6933
6934 // put locks in excl mode
6935 newi->filelock.set_state(LOCK_EXCL);
6936 newi->authlock.set_state(LOCK_EXCL);
6937 newi->xattrlock.set_state(LOCK_EXCL);
6938
6939 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6940 _inode->client_ranges[client].range.first = 0;
6941 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6942 _inode->client_ranges[client].follows = follows;
6943 newi->mark_clientwriteable();
6944 cap->mark_clientwriteable();
6945 }
6946 }
6947
6948 ceph_assert(dn->first == follows + 1);
6949 newi->first = dn->first;
6950
6951 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6952
6953 // prepare finisher
6954 mdr->ls = mdlog->get_current_segment();
6955 EUpdate *le = new EUpdate(mdlog, "mknod");
6956 mdlog->start_entry(le);
6957 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6958 journal_allocated_inos(mdr, &le->metablob);
6959
6960 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6961 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6962 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6963
6964 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6965 mds->balancer->maybe_fragment(dn->get_dir(), false);
6966 }
6967
6968
6969
6970 // MKDIR
6971 /* This function takes responsibility for the passed mdr*/
6972 void Server::handle_client_mkdir(MDRequestRef& mdr)
6973 {
6974 const cref_t<MClientRequest> &req = mdr->client_request;
6975
6976 mdr->disable_lock_cache();
6977 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6978 if (!dn)
6979 return;
6980
6981 CDir *dir = dn->get_dir();
6982 CInode *diri = dir->get_inode();
6983
6984 // mkdir check access
6985 if (!check_access(mdr, diri, MAY_WRITE))
6986 return;
6987
6988 if (!check_fragment_space(mdr, dir))
6989 return;
6990 if (!check_dir_max_entries(mdr, dir))
6991 return;
6992
6993 ceph_assert(dn->get_projected_linkage()->is_null());
6994 if (req->get_alternate_name().size() > alternate_name_max) {
6995 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6996 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6997 return;
6998 }
6999 dn->set_alternate_name(req->get_alternate_name());
7000
7001 // new inode
7002 unsigned mode = req->head.args.mkdir.mode;
7003 mode &= ~S_IFMT;
7004 mode |= S_IFDIR;
7005 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
7006 ceph_assert(newi);
7007
7008 // it's a directory.
7009 dn->push_projected_linkage(newi);
7010
7011 auto _inode = newi->_get_inode();
7012 _inode->version = dn->pre_dirty();
7013 _inode->rstat.rsubdirs = 1;
7014 _inode->accounted_rstat = _inode->rstat;
7015 _inode->update_backtrace();
7016
7017 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7018 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
7019 ceph_assert(follows >= realm->get_newest_seq());
7020
7021 dout(12) << " follows " << follows << dendl;
7022 ceph_assert(dn->first == follows + 1);
7023 newi->first = dn->first;
7024
7025 // ...and that new dir is empty.
7026 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
7027 newdir->state_set(CDir::STATE_CREATING);
7028 newdir->mark_complete();
7029 newdir->_get_fnode()->version = newdir->pre_dirty();
7030
7031 // prepare finisher
7032 mdr->ls = mdlog->get_current_segment();
7033 EUpdate *le = new EUpdate(mdlog, "mkdir");
7034 mdlog->start_entry(le);
7035 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
7036 journal_allocated_inos(mdr, &le->metablob);
7037 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7038 le->metablob.add_primary_dentry(dn, newi, true, true);
7039 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
7040
7041 // issue a cap on the directory
7042 int cmode = CEPH_FILE_MODE_RDWR;
7043 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7044 if (cap) {
7045 cap->set_wanted(0);
7046
7047 // put locks in excl mode
7048 newi->filelock.set_state(LOCK_EXCL);
7049 newi->authlock.set_state(LOCK_EXCL);
7050 newi->xattrlock.set_state(LOCK_EXCL);
7051 }
7052
7053 // make sure this inode gets into the journal
7054 le->metablob.add_opened_ino(newi->ino());
7055
7056 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
7057
7058 // We hit_dir (via hit_inode) in our finish callback, but by then we might
7059 // have overshot the split size (multiple mkdir in flight), so here is
7060 // an early chance to split the dir if this mkdir makes it oversized.
7061 mds->balancer->maybe_fragment(dir, false);
7062 }
7063
7064
7065 // SYMLINK
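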
7066
7067 void Server::handle_client_symlink(MDRequestRef& mdr)
7068 {
7069 const auto& req = mdr->client_request;
7070
7071 mdr->disable_lock_cache();
7072 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
7073 if (!dn)
7074 return;
7075
7076 CDir *dir = dn->get_dir();
7077 CInode *diri = dir->get_inode();
7078
7079 if (!check_access(mdr, diri, MAY_WRITE))
7080 return;
7081 if (!check_fragment_space(mdr, dir))
7082 return;
7083 if (!check_dir_max_entries(mdr, dir))
7084 return;
7085
7086 ceph_assert(dn->get_projected_linkage()->is_null());
7087 if (req->get_alternate_name().size() > alternate_name_max) {
7088 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
7089 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
     return;
7090 }
7091 dn->set_alternate_name(req->get_alternate_name());
7092
7093 unsigned mode = S_IFLNK | 0777;
7094 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
7095 ceph_assert(newi);
7096
7097 // it's a symlink
7098 dn->push_projected_linkage(newi);
7099
7100 newi->symlink = req->get_path2();
7101 auto _inode = newi->_get_inode();
7102 _inode->version = dn->pre_dirty();
7103 _inode->size = newi->symlink.length();
7104 _inode->rstat.rbytes = _inode->size;
7105 _inode->rstat.rfiles = 1;
7106 _inode->accounted_rstat = _inode->rstat;
7107 _inode->update_backtrace();
7108
7109 newi->first = dn->first;
7110
7111 // prepare finisher
7112 mdr->ls = mdlog->get_current_segment();
7113 EUpdate *le = new EUpdate(mdlog, "symlink");
7114 mdlog->start_entry(le);
7115 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
7116 journal_allocated_inos(mdr, &le->metablob);
7117 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7118 le->metablob.add_primary_dentry(dn, newi, true, true);
7119
7120 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
7121 mds->balancer->maybe_fragment(dir, false);
7122
7123 // flush the journal as soon as possible
7124 if (g_conf()->mds_kill_skip_replaying_inotable) {
7125 mdlog->flush();
7126 }
7127 }
7128
7129
7130
7131
7132
7133 // LINK
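// Handles CEPH_MDS_OP_LINK, i.e. hard-link creation such as link(2) or "ln"
// on a CephFS mount (paths illustrative):
//   link("/mnt/cephfs/a/file", "/mnt/cephfs/b/newname");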
7134
7135 void Server::handle_client_link(MDRequestRef& mdr)
7136 {
7137 const cref_t<MClientRequest> &req = mdr->client_request;
7138
7139 dout(7) << "handle_client_link " << req->get_filepath()
7140 << " to " << req->get_filepath2()
7141 << dendl;
7142
7143 mdr->disable_lock_cache();
7144
7145 CDentry *destdn;
7146 CInode *targeti;
7147
7148 if (req->get_filepath2().depth() == 0) {
7149 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
7150 if (!targeti) {
7151 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
7152 inodeno_t ino = req->get_filepath2().get_ino();
7153 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
7154 return;
7155 }
7156 mdr->pin(targeti);
7157
7158 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
7159 CDentry *pdn = targeti->get_projected_parent_dn();
7160 if (!pdn) {
7161 dout(7) << "target has no parent dn, failing..." << dendl;
7162 respond_to_request(mdr, -CEPHFS_EINVAL);
7163 return;
7164 }
7165 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
7166 return;
7167 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
7168 }
7169
7170 destdn = rdlock_path_xlock_dentry(mdr, false);
7171 if (!destdn)
7172 return;
7173 } else {
7174 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
7175 destdn = ret.first;
7176 if (!destdn)
7177 return;
7178
7179 if (!destdn->get_projected_linkage()->is_null()) {
7180 respond_to_request(mdr, -CEPHFS_EEXIST);
7181 return;
7182 }
7183
7184 targeti = ret.second->get_projected_linkage()->get_inode();
7185 }
7186
7187 ceph_assert(destdn->get_projected_linkage()->is_null());
7188 if (req->get_alternate_name().size() > alternate_name_max) {
7189 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
7190 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
7191 return;
7192 }
7193 destdn->set_alternate_name(req->get_alternate_name());
7194
7195 if (targeti->is_dir()) {
7196 dout(7) << "target is a dir, failing..." << dendl;
7197 respond_to_request(mdr, -CEPHFS_EINVAL);
7198 return;
7199 }
7200
7201 CDir *dir = destdn->get_dir();
7202 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
7203 dout(7) << "target is " << *targeti << dendl;
7204
7205 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7206 MutationImpl::LockOpVec lov;
7207 lov.add_xlock(&targeti->snaplock);
7208 lov.add_xlock(&targeti->linklock);
7209
7210 if (!mds->locker->acquire_locks(mdr, lov))
7211 return;
7212
7213 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7214 }
7215
7216 if (targeti->get_projected_inode()->nlink == 0) {
7217 dout(7) << "target has no link, failing..." << dendl;
7218 respond_to_request(mdr, -CEPHFS_ENOENT);
7219 return;
7220 }
7221
7222 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7223 if (!check_access(mdr, targeti, MAY_WRITE))
7224 return;
7225
7226 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
7227 return;
7228
7229 if (!check_fragment_space(mdr, dir))
7230 return;
7231
7232 if (!check_dir_max_entries(mdr, dir))
7233 return;
7234 }
7235
7236 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
7237 SnapRealm *target_realm = target_pin->find_snaprealm();
7238 if (target_pin != dir->inode &&
7239 target_realm->get_subvolume_ino() !=
7240 dir->inode->find_snaprealm()->get_subvolume_ino() &&
7241 /* The inode is temporarily located in the stray dir pending reintegration */
7242 !target_pin->is_stray()) {
7243 dout(7) << "target is in different subvolume, failing..." << dendl;
7244 respond_to_request(mdr, -CEPHFS_EXDEV);
7245 return;
7246 }
7247
7248 // go!
7249 ceph_assert(g_conf()->mds_kill_link_at != 1);
7250
7251 // local or remote?
7252 if (targeti->is_auth())
7253 _link_local(mdr, destdn, targeti, target_realm);
7254 else
7255 _link_remote(mdr, true, destdn, targeti);
7256 mds->balancer->maybe_fragment(dir, false);
7257 }
7258
7259
7260 class C_MDS_link_local_finish : public ServerLogContext {
7261 CDentry *dn;
7262 CInode *targeti;
7263 version_t dnpv;
7264 version_t tipv;
7265 bool adjust_realm;
7266 public:
7267 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
7268 version_t dnpv_, version_t tipv_, bool ar) :
7269 ServerLogContext(s, r), dn(d), targeti(ti),
7270 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7271 void finish(int r) override {
7272 ceph_assert(r == 0);
7273 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7274 }
7275 };
7276
7277
7278 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7279 {
7280 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
7281
7282 mdr->ls = mdlog->get_current_segment();
7283
7284 // predirty NEW dentry
7285 version_t dnpv = dn->pre_dirty();
7286 version_t tipv = targeti->pre_dirty();
7287
7288 // project inode update
7289 auto pi = targeti->project_inode(mdr);
7290 pi.inode->nlink++;
7291 pi.inode->ctime = mdr->get_op_stamp();
7292 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7293 pi.inode->rstat.rctime = mdr->get_op_stamp();
7294 pi.inode->change_attr++;
7295 pi.inode->version = tipv;
7296
7297 bool adjust_realm = false;
7298 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7299 sr_t *newsnap = targeti->project_snaprealm();
7300 targeti->mark_snaprealm_global(newsnap);
7301 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
7302 adjust_realm = true;
7303 }
7304
7305 // log + wait
7306 EUpdate *le = new EUpdate(mdlog, "link_local");
7307 mdlog->start_entry(le);
7308 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7309 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
7310 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
7311 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7312 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
7313
7314 // do this after predirty_*, to avoid funky extra dnl arg
7315 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7316
7317 journal_and_reply(mdr, targeti, dn, le,
7318 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7319 }
7320
7321 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
7322 version_t dnpv, version_t tipv, bool adjust_realm)
7323 {
7324 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
7325
7326 // link and unlock the NEW dentry
7327 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7328 if (!dnl->get_inode())
7329 dn->link_remote(dnl, targeti);
7330 dn->mark_dirty(dnpv, mdr->ls);
7331
7332 // target inode
7333 mdr->apply();
7334
7335 MDRequestRef null_ref;
7336 mdcache->send_dentry_link(dn, null_ref);
7337
7338 if (adjust_realm) {
7339 int op = CEPH_SNAP_OP_SPLIT;
7340 mds->mdcache->send_snap_update(targeti, 0, op);
7341 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7342 }
7343
7344 // bump target popularity
7345 mds->balancer->hit_inode(targeti, META_POP_IWR);
7346 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7347
7348 // reply
7349 respond_to_request(mdr, 0);
7350 }
7351
7352
7353 // link / unlink remote
7354
7355 class C_MDS_link_remote_finish : public ServerLogContext {
7356 bool inc;
7357 CDentry *dn;
7358 CInode *targeti;
7359 version_t dpv;
7360 public:
7361 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
7362 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
7363 dpv(d->get_projected_version()) {}
7364 void finish(int r) override {
7365 ceph_assert(r == 0);
7366 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
7367 }
7368 };
7369
7370 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
7371 {
7372 dout(10) << "_link_remote "
7373 << (inc ? "link ":"unlink ")
7374 << *dn << " to " << *targeti << dendl;
7375
7376 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7377 mds_rank_t linkauth = targeti->authority().first;
7378 if (mdr->more()->witnessed.count(linkauth) == 0) {
7379 if (mds->is_cluster_degraded() &&
7380 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
7381 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
7382 if (mdr->more()->waiting_on_peer.empty())
7383 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
7384 return;
7385 }
7386
7387 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
7388 int op;
7389 if (inc)
7390 op = MMDSPeerRequest::OP_LINKPREP;
7391 else
7392 op = MMDSPeerRequest::OP_UNLINKPREP;
7393 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7394 targeti->set_object_info(req->get_object_info());
7395 req->op_stamp = mdr->get_op_stamp();
7396 if (auto& desti_srnode = mdr->more()->desti_srnode)
7397 encode(*desti_srnode, req->desti_snapbl);
7398 mds->send_message_mds(req, linkauth);
7399
7400 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
7401 mdr->more()->waiting_on_peer.insert(linkauth);
7402 return;
7403 }
7404 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
7405
7406 ceph_assert(g_conf()->mds_kill_link_at != 2);
7407
7408 if (auto& desti_srnode = mdr->more()->desti_srnode) {
7409 delete desti_srnode;
7410 desti_srnode = NULL;
7411 }
7412
7413 mdr->set_mds_stamp(ceph_clock_now());
7414
7415 // add to event
7416 mdr->ls = mdlog->get_current_segment();
7417 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
7418 mdlog->start_entry(le);
7419 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7420 if (!mdr->more()->witnessed.empty()) {
7421 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7422 le->reqid = mdr->reqid;
7423 le->had_peers = true;
7424 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7425 }
7426
7427 if (inc) {
7428 dn->pre_dirty();
7429 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
7430 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7431 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7432 } else {
7433 dn->pre_dirty();
7434 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
7435 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7436 le->metablob.add_null_dentry(dn, true);
7437 dn->push_projected_linkage();
7438 }
7439
7440 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
7441 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7442 }
7443
7444 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
7445 CDentry *dn, CInode *targeti,
7446 version_t dpv)
7447 {
7448 dout(10) << "_link_remote_finish "
7449 << (inc ? "link ":"unlink ")
7450 << *dn << " to " << *targeti << dendl;
7451
7452 ceph_assert(g_conf()->mds_kill_link_at != 3);
7453
7454 if (!mdr->more()->witnessed.empty())
7455 mdcache->logged_leader_update(mdr->reqid);
7456
7457 if (inc) {
7458 // link the new dentry
7459 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7460 if (!dnl->get_inode())
7461 dn->link_remote(dnl, targeti);
7462 dn->mark_dirty(dpv, mdr->ls);
7463 } else {
7464 // unlink main dentry
7465 dn->get_dir()->unlink_inode(dn);
7466 dn->pop_projected_linkage();
7467 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
7468 }
7469
7470 mdr->apply();
7471
7472 MDRequestRef null_ref;
7473 if (inc)
7474 mdcache->send_dentry_link(dn, null_ref);
7475 else
7476 mdcache->send_dentry_unlink(dn, NULL, null_ref);
7477
7478 // bump target popularity
7479 mds->balancer->hit_inode(targeti, META_POP_IWR);
7480 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7481
7482 // reply
7483 respond_to_request(mdr, 0);
7484
7485 if (!inc)
7486 // removing a new dn?
7487 dn->get_dir()->try_remove_unlinked_dn(dn);
7488 }
7489
7490
7491 // remote linking/unlinking
7492
7493 class C_MDS_PeerLinkPrep : public ServerLogContext {
7494 CInode *targeti;
7495 bool adjust_realm;
7496 public:
7497 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
7498 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7499 void finish(int r) override {
7500 ceph_assert(r == 0);
7501 server->_logged_peer_link(mdr, targeti, adjust_realm);
7502 }
7503 };
7504
7505 class C_MDS_PeerLinkCommit : public ServerContext {
7506 MDRequestRef mdr;
7507 CInode *targeti;
7508 public:
7509 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7510 ServerContext(s), mdr(r), targeti(t) { }
7511 void finish(int r) override {
7512 server->_commit_peer_link(mdr, r, targeti);
7513 }
7514 };
7515
7516 void Server::handle_peer_link_prep(MDRequestRef& mdr)
7517 {
7518 dout(10) << "handle_peer_link_prep " << *mdr
7519 << " on " << mdr->peer_request->get_object_info()
7520 << dendl;
7521
7522 ceph_assert(g_conf()->mds_kill_link_at != 4);
7523
7524 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
7525 ceph_assert(targeti);
7526 dout(10) << "targeti " << *targeti << dendl;
7527 CDentry *dn = targeti->get_parent_dn();
7528 CDentry::linkage_t *dnl = dn->get_linkage();
7529 ceph_assert(dnl->is_primary());
7530
7531 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7532
7533 mdr->auth_pin(targeti);
7534
7535 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7536 ceph_assert(g_conf()->mds_kill_link_at != 5);
7537
7538 // journal it
7539 mdr->ls = mdlog->get_current_segment();
7540 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7541 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7542 mdlog->start_entry(le);
7543
7544 auto pi = dnl->get_inode()->project_inode(mdr);
7545
7546 // update journaled target inode
7547 bool inc;
7548 bool adjust_realm = false;
7549 bool realm_projected = false;
7550 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7551 inc = true;
7552 pi.inode->nlink++;
7553
7554 CDentry *target_pdn = targeti->get_projected_parent_dn();
7555 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7556 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7557 sr_t *newsnap = targeti->project_snaprealm();
7558 targeti->mark_snaprealm_global(newsnap);
7559 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
7560 adjust_realm = true;
7561 realm_projected = true;
7562 }
7563 } else {
7564 inc = false;
7565 pi.inode->nlink--;
7566 if (targeti->is_projected_snaprealm_global()) {
7567 ceph_assert(mdr->peer_request->desti_snapbl.length());
7568 auto p = mdr->peer_request->desti_snapbl.cbegin();
7569
7570 sr_t *newsnap = targeti->project_snaprealm();
7571 decode(*newsnap, p);
7572
7573 if (pi.inode->nlink == 0)
7574 ceph_assert(!newsnap->is_parent_global());
7575
7576 realm_projected = true;
7577 } else {
7578 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
7579 }
7580 }
7581
7582 link_rollback rollback;
7583 rollback.reqid = mdr->reqid;
7584 rollback.ino = targeti->ino();
7585 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7586 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7587 rollback.old_dir_mtime = pf->fragstat.mtime;
7588 rollback.old_dir_rctime = pf->rstat.rctime;
7589 rollback.was_inc = inc;
7590 if (realm_projected) {
7591 if (targeti->snaprealm) {
7592 encode(true, rollback.snapbl);
7593 targeti->encode_snap_blob(rollback.snapbl);
7594 } else {
7595 encode(false, rollback.snapbl);
7596 }
7597 }
7598 encode(rollback, le->rollback);
7599 mdr->more()->rollback_bl = le->rollback;
7600
7601 pi.inode->ctime = mdr->get_op_stamp();
7602 pi.inode->version = targeti->pre_dirty();
7603
7604 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7605
7606 // commit case
7607 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7608 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
7609 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7610
7611 // set up commit waiter
7612 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7613
7614 mdr->more()->peer_update_journaled = true;
7615 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7616 mdr, __func__);
7617 mdlog->flush();
7618 }
7619
7620 void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7621 {
7622 dout(10) << "_logged_peer_link " << *mdr
7623 << " " << *targeti << dendl;
7624
7625 ceph_assert(g_conf()->mds_kill_link_at != 6);
7626
7627 // update the target
7628 mdr->apply();
7629
7630 // hit pop
7631 mds->balancer->hit_inode(targeti, META_POP_IWR);
7632
7633 // done.
7634 mdr->reset_peer_request();
7635
7636 if (adjust_realm) {
7637 int op = CEPH_SNAP_OP_SPLIT;
7638 mds->mdcache->send_snap_update(targeti, 0, op);
7639 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7640 }
7641
7642 // ack
7643 if (!mdr->aborted) {
7644 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7645 mds->send_message_mds(reply, mdr->peer_to_mds);
7646 } else {
7647 dout(10) << " abort flag set, finishing" << dendl;
7648 mdcache->request_finish(mdr);
7649 }
7650 }
7651
7652
7653 struct C_MDS_CommittedPeer : public ServerLogContext {
7654 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7655 void finish(int r) override {
7656 server->_committed_peer(mdr);
7657 }
7658 };
7659
7660 void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7661 {
7662 dout(10) << "_commit_peer_link " << *mdr
7663 << " r=" << r
7664 << " " << *targeti << dendl;
7665
7666 ceph_assert(g_conf()->mds_kill_link_at != 7);
7667
7668 if (r == 0) {
7669 // drop our pins, etc.
7670 mdr->cleanup();
7671
7672 // write a commit to the journal
7673 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7674 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7675 mdlog->start_entry(le);
7676 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7677 mdlog->flush();
7678 } else {
7679 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7680 }
7681 }
7682
7683 void Server::_committed_peer(MDRequestRef& mdr)
7684 {
7685 dout(10) << "_committed_peer " << *mdr << dendl;
7686
7687 ceph_assert(g_conf()->mds_kill_link_at != 8);
7688
7689 bool assert_exist = mdr->more()->peer_update_journaled;
7690 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7691 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7692 mds->send_message_mds(req, mdr->peer_to_mds);
7693 mdcache->request_finish(mdr);
7694 }
7695
7696 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7697 MutationRef mut;
7698 map<client_t,ref_t<MClientSnap>> splits;
7699 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
7700 map<client_t,ref_t<MClientSnap>>&& _splits) :
7701 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7702 }
7703 void finish(int r) override {
7704 server->_link_rollback_finish(mut, mdr, splits);
7705 }
7706 };
7707
7708 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7709 {
7710 link_rollback rollback;
7711 auto p = rbl.cbegin();
7712 decode(rollback, p);
7713
7714 dout(10) << "do_link_rollback on " << rollback.reqid
7715 << (rollback.was_inc ? " inc":" dec")
7716 << " ino " << rollback.ino
7717 << dendl;
7718
7719 ceph_assert(g_conf()->mds_kill_link_at != 9);
7720
7721 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7722 ceph_assert(mdr || mds->is_resolve());
7723
7724 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7725 mut->ls = mds->mdlog->get_current_segment();
7726
7727 CInode *in = mdcache->get_inode(rollback.ino);
7728 ceph_assert(in);
7729 dout(10) << " target is " << *in << dendl;
7730 ceph_assert(!in->is_projected()); // live peer requests hold the versionlock xlock.
7731
7732 auto pi = in->project_inode(mut);
7733 pi.inode->version = in->pre_dirty();
7734
7735 // parent dir rctime
7736 CDir *parent = in->get_projected_parent_dn()->get_dir();
7737 auto pf = parent->project_fnode(mut);
7738 pf->version = parent->pre_dirty();
7739 if (pf->fragstat.mtime == pi.inode->ctime) {
7740 pf->fragstat.mtime = rollback.old_dir_mtime;
7741 if (pf->rstat.rctime == pi.inode->ctime)
7742 pf->rstat.rctime = rollback.old_dir_rctime;
7743 mut->add_updated_lock(&parent->get_inode()->filelock);
7744 mut->add_updated_lock(&parent->get_inode()->nestlock);
7745 }
7746
7747 // inode
7748 pi.inode->ctime = rollback.old_ctime;
7749 if (rollback.was_inc)
7750 pi.inode->nlink--;
7751 else
7752 pi.inode->nlink++;
7753
7754 map<client_t,ref_t<MClientSnap>> splits;
7755 if (rollback.snapbl.length() && in->snaprealm) {
7756 bool hadrealm;
7757 auto p = rollback.snapbl.cbegin();
7758 decode(hadrealm, p);
7759 if (hadrealm) {
7760 if (!mds->is_resolve()) {
7761 sr_t *new_srnode = new sr_t();
7762 decode(*new_srnode, p);
7763 in->project_snaprealm(new_srnode);
7764 } else {
7765 decode(in->snaprealm->srnode, p);
7766 }
7767 } else {
7768 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7769 if (!mds->is_resolve())
7770 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7771 in->project_snaprealm(NULL);
7772 }
7773 }
7774
7775 // journal it
7776 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7777 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7778 mdlog->start_entry(le);
7779 le->commit.add_dir_context(parent);
7780 le->commit.add_dir(parent, true);
7781 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7782
7783 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7784 mdr, __func__);
7785 mdlog->flush();
7786 }
7787
7788 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
7789 map<client_t,ref_t<MClientSnap>>& splits)
7790 {
7791 dout(10) << "_link_rollback_finish" << dendl;
7792
7793 ceph_assert(g_conf()->mds_kill_link_at != 10);
7794
7795 mut->apply();
7796
7797 if (!mds->is_resolve())
7798 mdcache->send_snaps(splits);
7799
7800 if (mdr)
7801 mdcache->request_finish(mdr);
7802
7803 mdcache->finish_rollback(mut->reqid, mdr);
7804
7805 mut->cleanup();
7806 }
7807
7808
7809 void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7810 {
7811 dout(10) << "handle_peer_link_prep_ack " << *mdr
7812 << " " << *m << dendl;
7813 mds_rank_t from = mds_rank_t(m->get_source().num());
7814
7815 ceph_assert(g_conf()->mds_kill_link_at != 11);
7816
7817 // note peer
7818 mdr->more()->peers.insert(from);
7819
7820 // witnessed!
7821 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7822 mdr->more()->witnessed.insert(from);
7823 ceph_assert(!m->is_not_journaled());
7824 mdr->more()->has_journaled_peers = true;
7825
7826 // remove from waiting list
7827 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7828 mdr->more()->waiting_on_peer.erase(from);
7829
7830 ceph_assert(mdr->more()->waiting_on_peer.empty());
7831
7832 dispatch_client_request(mdr); // go again!
7833 }
7834
7835
7836
7837
7838
7839 // UNLINK
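// Handles both CEPH_MDS_OP_UNLINK and CEPH_MDS_OP_RMDIR, i.e. unlink(2) and
// rmdir(2) from the client (paths illustrative):
//   unlink("/mnt/cephfs/dir/file");  rmdir("/mnt/cephfs/dir/empty-subdir");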
7840
7841 void Server::handle_client_unlink(MDRequestRef& mdr)
7842 {
7843 const cref_t<MClientRequest> &req = mdr->client_request;
7844 client_t client = mdr->get_client();
7845
7846 // rmdir or unlink?
7847 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7848
7849 if (rmdir)
7850 mdr->disable_lock_cache();
7851 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7852 if (!dn)
7853 return;
7854
7855 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
7856 ceph_assert(!dnl->is_null());
7857 CInode *in = dnl->get_inode();
7858
7859 if (rmdir) {
7860 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7861 } else {
7862 dout(7) << "handle_client_unlink on " << *dn << dendl;
7863 }
7864 dout(7) << "dn links to " << *in << dendl;
7865
7866 // rmdir vs is_dir
7867 if (in->is_dir()) {
7868 if (rmdir) {
7869 // do empty directory checks
7870 if (_dir_is_nonempty_unlocked(mdr, in)) {
7871 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7872 return;
7873 }
7874 } else {
7875 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
7876 respond_to_request(mdr, -CEPHFS_EISDIR);
7877 return;
7878 }
7879 } else {
7880 if (rmdir) {
7881 // unlink
7882 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
7883 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7884 return;
7885 }
7886 }
7887
7888 CInode *diri = dn->get_dir()->get_inode();
7889 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7890 if (!check_access(mdr, diri, MAY_WRITE))
7891 return;
7892 }
7893
7894 // -- create stray dentry? --
7895 CDentry *straydn = NULL;
7896 if (dnl->is_primary()) {
7897 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7898 if (!straydn)
7899 return;
7900 dout(10) << " straydn is " << *straydn << dendl;
7901 } else if (mdr->straydn) {
7902 mdr->unpin(mdr->straydn);
7903 mdr->straydn = NULL;
7904 }
7905
7906 // lock
7907 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7908 MutationImpl::LockOpVec lov;
7909
7910 lov.add_xlock(&in->linklock);
7911 lov.add_xlock(&in->snaplock);
7912 if (in->is_dir())
7913 lov.add_rdlock(&in->filelock); // to verify it's empty
7914
7915 if (straydn) {
7916 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7917 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7918 lov.add_xlock(&straydn->lock);
7919 }
7920
7921 if (!mds->locker->acquire_locks(mdr, lov))
7922 return;
7923
7924 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7925 }
7926
7927 if (in->is_dir() &&
7928 _dir_is_nonempty(mdr, in)) {
7929 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7930 return;
7931 }
7932
7933 if (straydn)
7934 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7935
7936 if (!mdr->more()->desti_srnode) {
7937 if (in->is_projected_snaprealm_global()) {
7938 sr_t *new_srnode = in->prepare_new_srnode(0);
7939 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7940 // dropping the last linkage or dropping the last remote linkage,
7941 // detach the inode from the global snaprealm
7942 auto nlink = in->get_projected_inode()->nlink;
7943 if (nlink == 1 ||
7944 (nlink == 2 && !dnl->is_primary() &&
7945 !in->get_projected_parent_dir()->inode->is_stray()))
7946 in->clear_snaprealm_global(new_srnode);
7947 mdr->more()->desti_srnode = new_srnode;
7948 } else if (dnl->is_primary()) {
7949 // prepare snaprealm blob for peer request
7950 SnapRealm *realm = in->find_snaprealm();
7951 snapid_t follows = realm->get_newest_seq();
7952 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7953 sr_t *new_srnode = in->prepare_new_srnode(follows);
7954 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7955 mdr->more()->desti_srnode = new_srnode;
7956 }
7957 }
7958 }
7959
7960 // yay!
7961 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7962 // subtree root auths need to be witnesses
7963 set<mds_rank_t> witnesses;
7964 in->list_replicas(witnesses);
7965 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7966
7967 for (set<mds_rank_t>::iterator p = witnesses.begin();
7968 p != witnesses.end();
7969 ++p) {
7970 if (mdr->more()->witnessed.count(*p)) {
7971 dout(10) << " already witnessed by mds." << *p << dendl;
7972 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7973 dout(10) << " already waiting on witness mds." << *p << dendl;
7974 } else {
7975 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7976 return;
7977 }
7978 }
7979 if (!mdr->more()->waiting_on_peer.empty())
7980 return; // we're waiting for a witness.
7981 }
7982
7983 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7984 mds->locker->create_lock_cache(mdr, diri);
7985
7986 // ok!
7987 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7988 _link_remote(mdr, false, dn, dnl->get_inode());
7989 else
7990 _unlink_local(mdr, dn, straydn);
7991 }
7992
7993 class C_MDS_unlink_local_finish : public ServerLogContext {
7994 CDentry *dn;
7995 CDentry *straydn;
7996 version_t dnpv; // deleted dentry
7997 public:
7998 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7999 ServerLogContext(s, r), dn(d), straydn(sd),
8000 dnpv(d->get_projected_version()) {}
8001 void finish(int r) override {
8002 ceph_assert(r == 0);
8003 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
8004 }
8005 };
8006
8007 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8008 {
8009 dout(10) << "_unlink_local " << *dn << dendl;
8010
8011 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8012 CInode *in = dnl->get_inode();
8013
8014
8015 // ok, let's do it.
8016 mdr->ls = mdlog->get_current_segment();
8017
8018 // prepare log entry
8019 EUpdate *le = new EUpdate(mdlog, "unlink_local");
8020 mdlog->start_entry(le);
8021 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
8022 if (!mdr->more()->witnessed.empty()) {
8023 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
8024 le->reqid = mdr->reqid;
8025 le->had_peers = true;
8026 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8027 }
8028
8029 if (straydn) {
8030 ceph_assert(dnl->is_primary());
8031 straydn->push_projected_linkage(in);
8032 }
8033
8034 // the unlinked dentry
8035 dn->pre_dirty();
8036
8037 auto pi = in->project_inode(mdr);
8038 {
8039 std::string t;
8040 dn->make_path_string(t, true);
8041 pi.inode->stray_prior_path = std::move(t);
8042 }
8043 pi.inode->version = in->pre_dirty();
8044 pi.inode->ctime = mdr->get_op_stamp();
8045 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
8046 pi.inode->rstat.rctime = mdr->get_op_stamp();
8047 pi.inode->change_attr++;
8048 pi.inode->nlink--;
8049 if (pi.inode->nlink == 0)
8050 in->state_set(CInode::STATE_ORPHAN);
8051
8052 if (mdr->more()->desti_srnode) {
8053 auto& desti_srnode = mdr->more()->desti_srnode;
8054 in->project_snaprealm(desti_srnode);
8055 desti_srnode = NULL;
8056 }
8057
8058 if (straydn) {
8059 // will manually pop projected inode
8060
8061 // primary link. add stray dentry.
8062 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
8063 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8064
8065 pi.inode->update_backtrace();
8066 le->metablob.add_primary_dentry(straydn, in, true, true);
8067 } else {
8068 // remote link. update remote inode.
8069 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
8070 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
8071 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
8072 }
8073
8074 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
8075 le->metablob.add_null_dentry(dn, true);
8076
8077 if (in->is_dir()) {
8078 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8079 le->metablob.renamed_dirino = in->ino();
8080 }
8081
8082 dn->push_projected_linkage();
8083
8084 if (straydn) {
8085 ceph_assert(in->first <= straydn->first);
8086 in->first = straydn->first;
8087 }
8088
8089 if (in->is_dir()) {
8090 ceph_assert(straydn);
8091 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8092 }
8093
8094 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
8095 }
8096
8097 void Server::_unlink_local_finish(MDRequestRef& mdr,
8098 CDentry *dn, CDentry *straydn,
8099 version_t dnpv)
8100 {
8101 dout(10) << "_unlink_local_finish " << *dn << dendl;
8102
8103 if (!mdr->more()->witnessed.empty())
8104 mdcache->logged_leader_update(mdr->reqid);
8105
8106 CInode *strayin = NULL;
8107 bool hadrealm = false;
8108 if (straydn) {
8109 // if there is a newly created snaprealm, we need to split the old snaprealm's
8110 // inodes_with_caps. So pop the snaprealm before the linkage changes.
8111 strayin = dn->get_linkage()->get_inode();
8112 hadrealm = strayin->snaprealm ? true : false;
8113 strayin->early_pop_projected_snaprealm();
8114 }
8115
8116 // unlink main dentry
8117 dn->get_dir()->unlink_inode(dn);
8118 dn->pop_projected_linkage();
8119 dn->mark_dirty(dnpv, mdr->ls);
8120
8121 // relink as stray? (i.e. was primary link?)
8122 if (straydn) {
8123 dout(20) << " straydn is " << *straydn << dendl;
8124 straydn->pop_projected_linkage();
8125 mdcache->touch_dentry_bottom(straydn);
8126 }
8127
8128 mdr->apply();
8129
8130 mdcache->send_dentry_unlink(dn, straydn, mdr);
8131
8132 if (straydn) {
8133 // update subtree map?
8134 if (strayin->is_dir())
8135 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
8136
8137 if (strayin->snaprealm && !hadrealm)
8138 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
8139 }
8140
8141 // bump pop
8142 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
8143
8144 // reply
8145 respond_to_request(mdr, 0);
8146
8147 // removing a new dn?
8148 dn->get_dir()->try_remove_unlinked_dn(dn);
8149
8150 // clean up ?
8151 // respond_to_request() drops locks. So stray reintegration can race with us.
8152 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8153 // Tip off the MDCache that this dentry is a stray that
8154 // might be eligible for purge.
8155 mdcache->notify_stray(straydn);
8156 }
8157 }
8158
8159 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
8160 {
8161 if (mds->is_cluster_degraded() &&
8162 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8163 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
8164 if (mdr->more()->waiting_on_peer.empty())
8165 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8166 return false;
8167 }
8168
8169 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
8170 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
8171 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
8172 for (auto dn : trace)
8173 req->srcdnpath.push_dentry(dn->get_name());
8174 mdcache->encode_replica_stray(straydn, who, req->straybl);
8175 if (mdr->more()->desti_srnode)
8176 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8177
8178 req->op_stamp = mdr->get_op_stamp();
8179 mds->send_message_mds(req, who);
8180
8181 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8182 mdr->more()->waiting_on_peer.insert(who);
8183 return true;
8184 }
8185
8186 struct C_MDS_PeerRmdirPrep : public ServerLogContext {
8187 CDentry *dn, *straydn;
8188 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
8189 : ServerLogContext(s, r), dn(d), straydn(st) {}
8190 void finish(int r) override {
8191 server->_logged_peer_rmdir(mdr, dn, straydn);
8192 }
8193 };
8194
8195 struct C_MDS_PeerRmdirCommit : public ServerContext {
8196 MDRequestRef mdr;
8197 CDentry *straydn;
8198 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
8199 : ServerContext(s), mdr(r), straydn(sd) { }
8200 void finish(int r) override {
8201 server->_commit_peer_rmdir(mdr, r, straydn);
8202 }
8203 };
8204
8205 void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
8206 {
8207 dout(10) << "handle_peer_rmdir_prep " << *mdr
8208 << " " << mdr->peer_request->srcdnpath
8209 << " to " << mdr->peer_request->destdnpath
8210 << dendl;
8211
8212 vector<CDentry*> trace;
8213 filepath srcpath(mdr->peer_request->srcdnpath);
8214 dout(10) << " src " << srcpath << dendl;
8215 CInode *in;
8216 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
8217 int r = mdcache->path_traverse(mdr, cf, srcpath,
8218 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
8219 &trace, &in);
8220 if (r > 0) return;
8221 if (r == -CEPHFS_ESTALE) {
8222 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8223 mdr->peer_to_mds, true);
8224 return;
8225 }
8226 ceph_assert(r == 0);
8227 CDentry *dn = trace.back();
8228 dout(10) << " dn " << *dn << dendl;
8229 mdr->pin(dn);
8230
8231 ceph_assert(mdr->straydn);
8232 CDentry *straydn = mdr->straydn;
8233 dout(10) << " straydn " << *straydn << dendl;
8234
8235 mdr->set_op_stamp(mdr->peer_request->op_stamp);
8236
8237 rmdir_rollback rollback;
8238 rollback.reqid = mdr->reqid;
8239 rollback.src_dir = dn->get_dir()->dirfrag();
8240 rollback.src_dname = dn->get_name();
8241 rollback.dest_dir = straydn->get_dir()->dirfrag();
8242 rollback.dest_dname = straydn->get_name();
8243 if (mdr->peer_request->desti_snapbl.length()) {
8244 if (in->snaprealm) {
8245 encode(true, rollback.snapbl);
8246 in->encode_snap_blob(rollback.snapbl);
8247 } else {
8248 encode(false, rollback.snapbl);
8249 }
8250 }
8251 encode(rollback, mdr->more()->rollback_bl);
8252 // FIXME: rollback snaprealm
8253 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
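// the rollback blob holds just the src/stray dirfrags and dentry names
// (plus an optional snap blob for the unlinked inode); that is all
// do_rmdir_rollback() needs to re-link the inode under its original dentry
// if the leader aborts.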
8254
8255 // set up commit waiter
8256 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
8257
8258 straydn->push_projected_linkage(in);
8259 dn->push_projected_linkage();
8260
8261 ceph_assert(straydn->first >= in->first);
8262 in->first = straydn->first;
8263
8264 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
8265 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
8266 _logged_peer_rmdir(mdr, dn, straydn);
8267 return;
8268 }
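// (fast path above: with no auth subtree nested under the dir, nothing on
// this rank needs to survive a restart, so the prep is applied without a
// journal entry and the ack is later marked not-journaled in
// _logged_peer_rmdir.)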
8269
8270 mdr->ls = mdlog->get_current_segment();
8271 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
8272 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
8273 mdlog->start_entry(le);
8274 le->rollback = mdr->more()->rollback_bl;
8275
8276 le->commit.add_dir_context(straydn->get_dir());
8277 le->commit.add_primary_dentry(straydn, in, true);
8278 // peer: no need to journal original dentry
8279
8280 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8281 le->commit.renamed_dirino = in->ino();
8282
8283 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8284 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
8285
8286 mdr->more()->peer_update_journaled = true;
8287 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
8288 mdr, __func__);
8289 mdlog->flush();
8290 }
8291
8292 void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8293 {
8294 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
8295 CInode *in = dn->get_linkage()->get_inode();
8296
8297 bool new_realm;
8298 if (mdr->peer_request->desti_snapbl.length()) {
8299 new_realm = !in->snaprealm;
8300 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
8301 ceph_assert(in->snaprealm);
8302 } else {
8303 new_realm = false;
8304 }
8305
8306 // update our cache now, so we are consistent with what is in the journal
8307 // when we journal a subtree map
8308 dn->get_dir()->unlink_inode(dn);
8309 straydn->pop_projected_linkage();
8310 dn->pop_projected_linkage();
8311
8312 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
8313
8314 if (new_realm)
8315 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
8316
8317 // done.
8318 mdr->reset_peer_request();
8319 mdr->straydn = 0;
8320
8321 if (!mdr->aborted) {
8322 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
8323 if (!mdr->more()->peer_update_journaled)
8324 reply->mark_not_journaled();
8325 mds->send_message_mds(reply, mdr->peer_to_mds);
8326 } else {
8327 dout(10) << " abort flag set, finishing" << dendl;
8328 mdcache->request_finish(mdr);
8329 }
8330 }
8331
8332 void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
8333 {
8334 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8335 << " " << *ack << dendl;
8336
8337 mds_rank_t from = mds_rank_t(ack->get_source().num());
8338
8339 mdr->more()->peers.insert(from);
8340 mdr->more()->witnessed.insert(from);
8341 if (!ack->is_not_journaled())
8342 mdr->more()->has_journaled_peers = true;
8343
8344 // remove from waiting list
8345 ceph_assert(mdr->more()->waiting_on_peer.count(from));
8346 mdr->more()->waiting_on_peer.erase(from);
8347
8348 if (mdr->more()->waiting_on_peer.empty())
8349 dispatch_client_request(mdr); // go again!
8350 else
8351 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
8352 }
8353
8354 void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
8355 {
8356 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
8357
8358 if (r == 0) {
8359 if (mdr->more()->peer_update_journaled) {
8360 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8361 if (strayin && !strayin->snaprealm)
8362 mdcache->clear_dirty_bits_for_stray(strayin);
8363 }
8364
8365 mdr->cleanup();
8366
8367 if (mdr->more()->peer_update_journaled) {
8368 // write a commit to the journal
8369 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
8370 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
8371 EPeerUpdate::RMDIR);
8372 mdlog->start_entry(le);
8373 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
8374 mdlog->flush();
8375 } else {
8376 _committed_peer(mdr);
8377 }
8378 } else {
8379 // abort
8380 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
8381 }
8382 }
8383
8384 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
8385 metareqid_t reqid;
8386 CDentry *dn;
8387 CDentry *straydn;
8388 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
8389 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
8390 void finish(int r) override {
8391 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
8392 }
8393 };
8394
8395 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
8396 {
8397 // unlike the other rollback methods, the rmdir rollback is only
8398 // needed to record the subtree changes in the journal for inode
8399 // replicas who are auth for empty dirfrags. no actual changes to
8400 // the file system are taking place here, so there is no Mutation.
8401
8402 rmdir_rollback rollback;
8403 auto p = rbl.cbegin();
8404 decode(rollback, p);
8405
8406 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
8407 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
8408 ceph_assert(mdr || mds->is_resolve());
8409
8410 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
8411 if (!dir)
8412 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
8413 ceph_assert(dir);
8414 CDentry *dn = dir->lookup(rollback.src_dname);
8415 ceph_assert(dn);
8416 dout(10) << " dn " << *dn << dendl;
8417 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
8418 ceph_assert(straydir);
8419 CDentry *straydn = straydir->lookup(rollback.dest_dname);
8420 ceph_assert(straydn);
8421 dout(10) << " straydn " << *straydn << dendl;
8422 CInode *in = straydn->get_linkage()->get_inode();
8423
8424 dn->push_projected_linkage(in);
8425 straydn->push_projected_linkage();
8426
8427 if (rollback.snapbl.length() && in->snaprealm) {
8428 bool hadrealm;
8429 auto p = rollback.snapbl.cbegin();
8430 decode(hadrealm, p);
8431 if (hadrealm) {
8432 decode(in->snaprealm->srnode, p);
8433 } else {
8434 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
8435 }
8436 }
8437
8438 if (mdr && !mdr->more()->peer_update_journaled) {
8439 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
8440
8441 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
8442 return;
8443 }
8444
8445
8446 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
8447 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
8448 mdlog->start_entry(le);
8449
8450 le->commit.add_dir_context(dn->get_dir());
8451 le->commit.add_primary_dentry(dn, in, true);
8452 // peer: no need to journal straydn
8453
8454 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8455 le->commit.renamed_dirino = in->ino();
8456
8457 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
8458
8459 submit_mdlog_entry(le,
8460 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
8461 dn, straydn),
8462 mdr, __func__);
8463 mdlog->flush();
8464 }
8465
8466 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
8467 {
8468 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
8469
8470 straydn->get_dir()->unlink_inode(straydn);
8471 dn->pop_projected_linkage();
8472 straydn->pop_projected_linkage();
8473
8474 CInode *in = dn->get_linkage()->get_inode();
8475 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
8476 !mdr || mdr->more()->peer_update_journaled);
8477
8478 if (mds->is_resolve()) {
8479 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
8480 mdcache->try_trim_non_auth_subtree(root);
8481 }
8482
8483 if (mdr)
8484 mdcache->request_finish(mdr);
8485
8486 mdcache->finish_rollback(reqid, mdr);
8487 }
8488
8489
8490 /** _dir_is_nonempty[_unlocked]
8491 *
8492 * check if a directory is non-empty (in which case we cannot rmdir it).
8493 *
8494 * the unlocked variant is a fastpath check. we can't really be
8495 * sure until we rdlock the filelock.
8496 */
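// the unlocked variant is used for the early, best-effort ENOTEMPTY check
// in handle_client_rename before locks are taken; _dir_is_nonempty() is the
// authoritative check, run once the target's filelock is readable, and
// compares the summed fragstats against the inode's dirstat.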
8497 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
8498 {
8499 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
8500 ceph_assert(in->is_auth());
8501
8502 if (in->filelock.is_cached())
8503 return false; // there can be pending async create/unlink. don't know.
8504 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
8505 return true; // in a snapshot!
8506
8507 auto&& ls = in->get_dirfrags();
8508 for (const auto& dir : ls) {
8509 // is the frag obviously non-empty?
8510 if (dir->is_auth()) {
8511 if (dir->get_projected_fnode()->fragstat.size()) {
8512 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8513 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
8514 return true;
8515 }
8516 }
8517 }
8518
8519 return false;
8520 }
8521
8522 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
8523 {
8524 dout(10) << "dir_is_nonempty " << *in << dendl;
8525 ceph_assert(in->is_auth());
8526 ceph_assert(in->filelock.can_read(mdr->get_client()));
8527
8528 frag_info_t dirstat;
8529 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
8530
8531 auto&& ls = in->get_dirfrags();
8532 for (const auto& dir : ls) {
8533 const auto& pf = dir->get_projected_fnode();
8534 if (pf->fragstat.size()) {
8535 dout(10) << "dir_is_nonempty dirstat has "
8536 << pf->fragstat.size() << " items " << *dir << dendl;
8537 return true;
8538 }
8539
8540 if (pf->accounted_fragstat.version == dirstat_version)
8541 dirstat.add(pf->accounted_fragstat);
8542 else
8543 dirstat.add(pf->fragstat);
8544 }
8545
8546 return dirstat.size() != in->get_projected_inode()->dirstat.size();
8547 }
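// illustrative arithmetic: if the frags' (accounted) fragstats sum to 0
// items but the inode's dirstat still records 2, the sizes differ, the dir
// is treated as non-empty, and the caller replies CEPHFS_ENOTEMPTY.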
8548
8549
8550 // ======================================================
8551
8552
8553 class C_MDS_rename_finish : public ServerLogContext {
8554 CDentry *srcdn;
8555 CDentry *destdn;
8556 CDentry *straydn;
8557 public:
8558 C_MDS_rename_finish(Server *s, MDRequestRef& r,
8559 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
8560 ServerLogContext(s, r),
8561 srcdn(sdn), destdn(ddn), straydn(stdn) { }
8562 void finish(int r) override {
8563 ceph_assert(r == 0);
8564 server->_rename_finish(mdr, srcdn, destdn, straydn);
8565 }
8566 };
8567
8568
8569 /** handle_client_rename
8570 *
8571 * rename leader is the destdn auth. this is because cached inodes
8572 * must remain connected. thus, any replica of srci must also
8573 * replicate destdn, and possibly straydn, so that srci (and
8574 * destdn->inode) remain connected during the rename.
8575 *
8576 * to do this, we freeze srci, then leader (destdn auth) verifies that
8577 * all other nodes have also replicated destdn and straydn. note that
8578 * destdn replicas need not also replicate srci. this only works when
8579 * destdn is leader.
8580 *
8581 * This function takes responsibility for the passed mdr.
8582 */
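// purely illustrative (hypothetical) client-side trigger via libcephfs,
// not part of the MDS code path:
//
//   // assumes an already-mounted struct ceph_mount_info *cmount
//   ceph_rename(cmount, "/dir_a/name", "/dir_b/name");
//
// the rank that is auth for the destination dentry acts as rename leader
// and runs this handler; the srcdn auth, if different, is prepared last as
// a witness (see below).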
8583 void Server::handle_client_rename(MDRequestRef& mdr)
8584 {
8585 const auto& req = mdr->client_request;
8586 dout(7) << "handle_client_rename " << *req << dendl;
8587
8588 filepath destpath = req->get_filepath();
8589 filepath srcpath = req->get_filepath2();
8590 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
8591 respond_to_request(mdr, -CEPHFS_EBUSY);
8592 return;
8593 }
8594
8595 if (req->get_alternate_name().size() > alternate_name_max) {
8596 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8597 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
8598 return;
8599 }
8600
8601 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8602 if (!destdn)
8603 return;
8604
8605 dout(10) << " destdn " << *destdn << dendl;
8606 CDir *destdir = destdn->get_dir();
8607 ceph_assert(destdir->is_auth());
8608 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8609
8610 dout(10) << " srcdn " << *srcdn << dendl;
8611 CDir *srcdir = srcdn->get_dir();
8612 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8613 CInode *srci = srcdnl->get_inode();
8614 dout(10) << " srci " << *srci << dendl;
8615
8616 // -- some sanity checks --
8617 if (destdn == srcdn) {
8618 dout(7) << "rename src=dest, noop" << dendl;
8619 respond_to_request(mdr, 0);
8620 return;
8621 }
8622
8623 // dest a child of src?
8624 // e.g. mv /usr /usr/foo
8625 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8626 dout(7) << "cannot rename item to be a child of itself" << dendl;
8627 respond_to_request(mdr, -CEPHFS_EINVAL);
8628 return;
8629 }
8630
8631 // is this a stray migration, reintegration or merge? (sanity checks!)
8632 if (mdr->reqid.name.is_mds() &&
8633 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8634 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8635 !(destdnl->is_remote() &&
8636 destdnl->get_remote_ino() == srci->ino())) {
8637 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatever.
8638 return;
8639 }
8640
8641 CInode *oldin = 0;
8642 if (!destdnl->is_null()) {
8643 //dout(10) << "dest dn exists " << *destdn << dendl;
8644 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8645 if (!oldin) return;
8646 dout(10) << " oldin " << *oldin << dendl;
8647
8648 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8649 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
8650 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8651 return;
8652 }
8653
8654 // mv /some/thing /to/some/existing_other_thing
8655 if (oldin->is_dir() && !srci->is_dir()) {
8656 respond_to_request(mdr, -CEPHFS_EISDIR);
8657 return;
8658 }
8659 if (!oldin->is_dir() && srci->is_dir()) {
8660 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8661 return;
8662 }
8663 if (srci == oldin && !srcdir->inode->is_stray()) {
8664 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8665 return;
8666 }
8667 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8668 /* the dentry exists but the alternate_names do not match, fail... */
8669 respond_to_request(mdr, -CEPHFS_EINVAL);
8670 return;
8671 }
8672 }
8673
8674 vector<CDentry*>& srctrace = mdr->dn[1];
8675 vector<CDentry*>& desttrace = mdr->dn[0];
8676
8677 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8678 if (destpath.get_ino() != srcpath.get_ino() &&
8679 !(req->get_source().is_mds() &&
8680 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8681 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8682 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8683 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8684 while (srcbase != destbase &&
8685 !srcbase->is_projected_ancestor_of(destbase)) {
8686 CDentry *pdn = srcbase->get_projected_parent_dn();
8687 srctrace.insert(srctrace.begin(), pdn);
8688 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8689 srcbase = pdn->get_dir()->get_inode();
8690 }
8691
8692 // then, extend destpath until it shares the same parent inode as srcpath.
8693 while (destbase != srcbase) {
8694 CDentry *pdn = destbase->get_projected_parent_dn();
8695 desttrace.insert(desttrace.begin(), pdn);
8696 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8697 destbase = pdn->get_dir()->get_inode();
8698 }
8699 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8700 }
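// worked example: for "mv /a/b/x /a/c/y", srctrace starts as [x] and
// desttrace as [y]; the loops above prepend the dentries for b and c, so
// both traces become rooted at the common ancestor /a, as locking requires.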
8701
8702
8703 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8704 if (linkmerge)
8705 dout(10) << " this is a link merge" << dendl;
8706
8707 // -- create stray dentry? --
8708 CDentry *straydn = NULL;
8709 if (destdnl->is_primary() && !linkmerge) {
8710 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8711 if (!straydn)
8712 return;
8713 dout(10) << " straydn is " << *straydn << dendl;
8714 } else if (mdr->straydn) {
8715 mdr->unpin(mdr->straydn);
8716 mdr->straydn = NULL;
8717 }
8718
8719
8720 // -- locks --
8721 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8722 MutationImpl::LockOpVec lov;
8723
8724 // we need to update srci's ctime. xlock its least contended lock to do that...
8725 lov.add_xlock(&srci->linklock);
8726 lov.add_xlock(&srci->snaplock);
8727
8728 if (oldin) {
8729 // xlock oldin (for nlink--)
8730 lov.add_xlock(&oldin->linklock);
8731 lov.add_xlock(&oldin->snaplock);
8732 if (oldin->is_dir()) {
8733 ceph_assert(srci->is_dir());
8734 lov.add_rdlock(&oldin->filelock); // to verify it's empty
8735
8736 // adjust locking order?
8737 int cmp = mdr->compare_paths();
8738 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8739 std::reverse(lov.begin(), lov.end());
8740 } else {
8741 ceph_assert(!srci->is_dir());
8742 // adjust locking order;
8743 if (srci->ino() > oldin->ino())
8744 std::reverse(lov.begin(), lov.end());
8745 }
8746 }
8747
8748 // straydn?
8749 if (straydn) {
8750 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8751 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8752 lov.add_xlock(&straydn->lock);
8753 }
8754
8755 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8756 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8757 return;
8758
8759 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8760 }
8761
8762 if (linkmerge)
8763 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8764
8765 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
8766 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8767 return;
8768
8769 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8770 return;
8771
8772 if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
8773 return;
8774
8775 if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
8776 return;
8777
8778 if (!check_access(mdr, srci, MAY_WRITE))
8779 return;
8780 }
8781
8782 // with read lock, really verify oldin is empty
8783 if (oldin &&
8784 oldin->is_dir() &&
8785 _dir_is_nonempty(mdr, oldin)) {
8786 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8787 return;
8788 }
8789
8790 /* project_snaprealm_past_parent() will do this job
8791 *
8792 // moving between snaprealms?
8793 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8794 SnapRealm *srcrealm = srci->find_snaprealm();
8795 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8796 if (srcrealm != destrealm &&
8797 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8798 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8799 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8800 mdcache->snaprealm_create(mdr, srci);
8801 return;
8802 }
8803 }
8804 */
8805
8806 SnapRealm *dest_realm = nullptr;
8807 SnapRealm *src_realm = nullptr;
8808 if (!linkmerge) {
8809 dest_realm = destdir->inode->find_snaprealm();
8810 if (srcdir->inode == destdir->inode)
8811 src_realm = dest_realm;
8812 else
8813 src_realm = srcdir->inode->find_snaprealm();
8814 if (src_realm != dest_realm &&
8815 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
8816 respond_to_request(mdr, -CEPHFS_EXDEV);
8817 return;
8818 }
8819 }
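// a rename that would cross a subvolume boundary (the two snaprealms report
// different subvolume inos) is rejected with CEPHFS_EXDEV above; userspace
// tools such as mv typically fall back to copy-and-unlink on EXDEV.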
8820
8821 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8822
8823 // -- open all srcdn inode frags, if any --
8824 // we need these open so that auth can properly delegate from inode to dirfrags
8825 // after the inode is _ours_.
8826 if (srcdnl->is_primary() &&
8827 !srcdn->is_auth() &&
8828 srci->is_dir()) {
8829 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8830 mdr->set_stickydirs(srci);
8831
8832 frag_vec_t leaves;
8833 srci->dirfragtree.get_leaves(leaves);
8834 for (const auto& leaf : leaves) {
8835 CDir *dir = srci->get_dirfrag(leaf);
8836 if (!dir) {
8837 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8838 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8839 return;
8840 }
8841 }
8842 }
8843
8844 // -- prepare snaprealm ---
8845
8846 if (linkmerge) {
8847 if (!mdr->more()->srci_srnode &&
8848 srci->get_projected_inode()->nlink == 1 &&
8849 srci->is_projected_snaprealm_global()) {
8850 sr_t *new_srnode = srci->prepare_new_srnode(0);
8851 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8852
8853 srci->clear_snaprealm_global(new_srnode);
8854 mdr->more()->srci_srnode = new_srnode;
8855 }
8856 } else {
8857 if (oldin && !mdr->more()->desti_srnode) {
8858 if (oldin->is_projected_snaprealm_global()) {
8859 sr_t *new_srnode = oldin->prepare_new_srnode(0);
8860 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8861 // dropping the last linkage or dropping the last remote linkage,
8862 // detach the inode from the global snaprealm
8863 auto nlink = oldin->get_projected_inode()->nlink;
8864 if (nlink == 1 ||
8865 (nlink == 2 && !destdnl->is_primary() &&
8866 !oldin->get_projected_parent_dir()->inode->is_stray()))
8867 oldin->clear_snaprealm_global(new_srnode);
8868 mdr->more()->desti_srnode = new_srnode;
8869 } else if (destdnl->is_primary()) {
8870 snapid_t follows = dest_realm->get_newest_seq();
8871 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8872 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8873 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8874 mdr->more()->desti_srnode = new_srnode;
8875 }
8876 }
8877 }
8878 if (!mdr->more()->srci_srnode) {
8879 if (srci->is_projected_snaprealm_global()) {
8880 sr_t *new_srnode = srci->prepare_new_srnode(0);
8881 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8882 mdr->more()->srci_srnode = new_srnode;
8883 } else if (srcdnl->is_primary()) {
8884 snapid_t follows = src_realm->get_newest_seq();
8885 if (src_realm != dest_realm &&
8886 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8887 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8888 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8889 mdr->more()->srci_srnode = new_srnode;
8890 }
8891 }
8892 }
8893 }
8894
8895 // -- prepare witnesses --
8896
8897 /*
8898 * NOTE: we use _all_ replicas as witnesses.
8899 * this probably isn't totally necessary (esp for file renames),
8900 * but if/when we change that, we have to make sure rejoin is
8901 * sufficiently robust to handle strong rejoins from survivors
8902 * with totally wrong dentry->inode linkage.
8903 * (currently, it can ignore rename effects, because the resolve
8904 * stage will sort them out.)
8905 */
8906 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8907 if (srcdn->is_auth())
8908 srcdn->list_replicas(witnesses);
8909 else
8910 witnesses.insert(srcdn->authority().first);
8911 if (srcdnl->is_remote() && !srci->is_auth())
8912 witnesses.insert(srci->authority().first);
8913 destdn->list_replicas(witnesses);
8914 if (destdnl->is_remote() && !oldin->is_auth())
8915 witnesses.insert(oldin->authority().first);
8916 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8917
8918 if (!witnesses.empty()) {
8919 // Replicas can't see projected dentry linkages and will get confused.
8920 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8921 // can't project these inodes' linkages.
8922 bool need_flush = false;
8923 for (auto& dn : srctrace) {
8924 if (dn->is_projected()) {
8925 need_flush = true;
8926 break;
8927 }
8928 }
8929 if (!need_flush) {
8930 CDentry *dn = destdn;
8931 do {
8932 if (dn->is_projected()) {
8933 need_flush = true;
8934 break;
8935 }
8936 CInode *diri = dn->get_dir()->get_inode();
8937 dn = diri->get_projected_parent_dn();
8938 } while (dn);
8939 }
8940 if (need_flush) {
8941 mdlog->wait_for_safe(
8942 new MDSInternalContextWrapper(mds,
8943 new C_MDS_RetryRequest(mdcache, mdr)));
8944 mdlog->flush();
8945 return;
8946 }
8947 }
8948
8949 // do srcdn auth last
8950 mds_rank_t last = MDS_RANK_NONE;
8951 if (!srcdn->is_auth()) {
8952 last = srcdn->authority().first;
8953 mdr->more()->srcdn_auth_mds = last;
8954 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8955 // are involved in the rename operation.
8956 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8957 dout(10) << " preparing ambiguous auth for srci" << dendl;
8958 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8959 ceph_assert(mdr->more()->rename_inode == srci);
8960 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8961 return;
8962 }
8963 }
8964
8965 for (set<mds_rank_t>::iterator p = witnesses.begin();
8966 p != witnesses.end();
8967 ++p) {
8968 if (*p == last) continue; // do it last!
8969 if (mdr->more()->witnessed.count(*p)) {
8970 dout(10) << " already witnessed by mds." << *p << dendl;
8971 } else if (mdr->more()->waiting_on_peer.count(*p)) {
8972 dout(10) << " already waiting on witness mds." << *p << dendl;
8973 } else {
8974 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8975 return;
8976 }
8977 }
8978 if (!mdr->more()->waiting_on_peer.empty())
8979 return; // we're waiting for a witness.
8980
8981 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8982 dout(10) << " preparing last witness (srcdn auth)" << dendl;
8983 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
8984 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8985 return;
8986 }
8987
8988 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8989 if (!mdr->more()->peers.empty() && !srci->is_dir())
8990 ceph_assert(g_conf()->mds_kill_rename_at != 3);
8991 if (!mdr->more()->peers.empty() && srci->is_dir())
8992 ceph_assert(g_conf()->mds_kill_rename_at != 4);
8993
8994 // -- declare now --
8995 mdr->set_mds_stamp(ceph_clock_now());
8996
8997 // -- prepare journal entry --
8998 mdr->ls = mdlog->get_current_segment();
8999 EUpdate *le = new EUpdate(mdlog, "rename");
9000 mdlog->start_entry(le);
9001 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
9002 if (!mdr->more()->witnessed.empty()) {
9003 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
9004
9005 le->reqid = mdr->reqid;
9006 le->had_peers = true;
9007
9008 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
9009 // no need to send frozen auth pin to the recovering auth MDS of srci
9010 mdr->more()->is_remote_frozen_authpin = false;
9011 }
9012
9013 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
9014 if (le->client_map.length())
9015 le->cmapv = mds->sessionmap.get_projected();
9016
9017 // -- commit locally --
9018 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
9019
9020 journal_and_reply(mdr, srci, destdn, le, fin);
9021 mds->balancer->maybe_fragment(destdn->get_dir(), false);
9022 }
9023
9024
9025 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9026 {
9027 dout(10) << "_rename_finish " << *mdr << dendl;
9028
9029 if (!mdr->more()->witnessed.empty())
9030 mdcache->logged_leader_update(mdr->reqid);
9031
9032 // apply
9033 _rename_apply(mdr, srcdn, destdn, straydn);
9034
9035 mdcache->send_dentry_link(destdn, mdr);
9036
9037 CDentry::linkage_t *destdnl = destdn->get_linkage();
9038 CInode *in = destdnl->get_inode();
9039 bool need_eval = mdr->more()->cap_imports.count(in);
9040
9041 // test hack: test peer commit
9042 if (!mdr->more()->peers.empty() && !in->is_dir())
9043 ceph_assert(g_conf()->mds_kill_rename_at != 5);
9044 if (!mdr->more()->peers.empty() && in->is_dir())
9045 ceph_assert(g_conf()->mds_kill_rename_at != 6);
9046
9047 // bump popularity
9048 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9049 if (destdnl->is_remote() && in->is_auth())
9050 mds->balancer->hit_inode(in, META_POP_IWR);
9051
9052 // did we import srci? if so, explicitly ack that import before we unlock and reply.
9053
9054 ceph_assert(g_conf()->mds_kill_rename_at != 7);
9055
9056 // reply
9057 respond_to_request(mdr, 0);
9058
9059 if (need_eval)
9060 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
9061
9062 // clean up?
9063 // respond_to_request() drops locks. So stray reintegration can race with us.
9064 if (straydn && !straydn->get_projected_linkage()->is_null()) {
9065 mdcache->notify_stray(straydn);
9066 }
9067 }
9068
9069
9070
9071 // helpers
9072
9073 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
9074 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
9075 {
9076 const auto& client_req = mdr->client_request;
9077 ceph_assert(client_req);
9078
9079 if (mds->is_cluster_degraded() &&
9080 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
9081 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
9082 if (mdr->more()->waiting_on_peer.empty())
9083 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
9084 return false;
9085 }
9086
9087 dout(10) << "_rename_prepare_witness mds." << who << dendl;
9088 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
9089
9090 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
9091 for (auto dn : srctrace)
9092 req->srcdnpath.push_dentry(dn->get_name());
9093 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
9094 for (auto dn : dsttrace)
9095 req->destdnpath.push_dentry(dn->get_name());
9096 req->alternate_name = client_req->alternate_name;
9097 if (straydn)
9098 mdcache->encode_replica_stray(straydn, who, req->straybl);
9099
9100 if (mdr->more()->srci_srnode)
9101 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
9102 if (mdr->more()->desti_srnode)
9103 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
9104
9105 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
9106
9107 // srcdn auth will verify our current witness list is sufficient
9108 req->witnesses = witnesses;
9109
9110 req->op_stamp = mdr->get_op_stamp();
9111 mds->send_message_mds(req, who);
9112
9113 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
9114 mdr->more()->waiting_on_peer.insert(who);
9115 return true;
9116 }
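// the prep message carries both full dentry paths, a replica of the stray
// dentry (if any) and the projected snaprealm blobs, so the witness can
// build the same projected state on its side before acking the prepare.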
9117
9118 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
9119 {
9120 version_t oldpv = mdr->more()->inode_import_v;
9121
9122 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9123
9124 /* import node */
9125 auto blp = mdr->more()->inode_import.cbegin();
9126
9127 // imported caps
9128 map<client_t,entity_inst_t> client_map;
9129 map<client_t, client_metadata_t> client_metadata_map;
9130 decode(client_map, blp);
9131 decode(client_metadata_map, blp);
9132 prepare_force_open_sessions(client_map, client_metadata_map,
9133 mdr->more()->imported_session_map);
9134 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
9135 encode(client_metadata_map, *client_map_bl);
9136
9137 list<ScatterLock*> updated_scatterlocks;
9138 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
9139 mdr->more()->cap_imports, updated_scatterlocks);
9140
9141 // hack: force back to !auth and clean, temporarily
9142 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
9143 srcdnl->get_inode()->mark_clean();
9144
9145 return oldpv;
9146 }
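// note: the import blob is decoded above in the order
// {client_map, client_metadata_map, inode+caps}; the client maps are
// re-encoded into *client_map_bl, which the caller stores in the EUpdate's
// client_map (with cmapv set to the projected sessionmap version).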
9147
9148 bool Server::_need_force_journal(CInode *diri, bool empty)
9149 {
9150 auto&& dirs = diri->get_dirfrags();
9151
9152 bool force_journal = false;
9153 if (empty) {
9154 for (const auto& dir : dirs) {
9155 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
9156 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
9157 force_journal = true;
9158 break;
9159 } else
9160 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
9161 }
9162 } else {
9163 // see if any children of our frags are auth subtrees.
9164 std::vector<CDir*> subtrees;
9165 mdcache->get_subtrees(subtrees);
9166 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
9167 for (const auto& dir : dirs) {
9168 for (const auto& subtree : subtrees) {
9169 if (dir->contains(subtree)) {
9170 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
9171 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
9172 << *subtree << dendl;
9173 force_journal = true;
9174 break;
9175 } else
9176 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
9177 } else
9178 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
9179 }
9180 if (force_journal)
9181 break;
9182 }
9183 }
9184 return force_journal;
9185 }
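// example: even when this rank is not auth for the renamed directory, a
// dirfrag of it (or a subtree nested underneath) may be an auth subtree
// root here; in that case the dentry must still be journaled locally so
// replay can re-establish those subtree bounds.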
9186
9187 void Server::_rename_prepare(MDRequestRef& mdr,
9188 EMetaBlob *metablob, bufferlist *client_map_bl,
9189 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
9190 CDentry *straydn)
9191 {
9192 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9193 if (straydn)
9194 dout(10) << " straydn " << *straydn << dendl;
9195
9196 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9197 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9198 CInode *srci = srcdnl->get_inode();
9199 CInode *oldin = destdnl->get_inode();
9200
9201 // primary+remote link merge?
9202 bool linkmerge = (srci == oldin);
9203 if (linkmerge)
9204 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9205 bool silent = srcdn->get_dir()->inode->is_stray();
9206
9207 bool force_journal_dest = false;
9208 if (srci->is_dir() && !destdn->is_auth()) {
9209 if (srci->is_auth()) {
9210 // if we are auth for srci and exporting it, force journal because journal replay needs
9211 // the source inode to create auth subtrees.
9212 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
9213 force_journal_dest = true;
9214 } else
9215 force_journal_dest = _need_force_journal(srci, false);
9216 }
9217
9218 bool force_journal_stray = false;
9219 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
9220 force_journal_stray = _need_force_journal(oldin, true);
9221
9222 if (linkmerge)
9223 dout(10) << " merging remote and primary links to the same inode" << dendl;
9224 if (silent)
9225 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
9226 if (force_journal_dest)
9227 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
9228 if (force_journal_stray)
9229 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
9230
9231 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
9232 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
9233 metablob->renamed_dirino = srci->ino();
9234 } else if (oldin && oldin->is_dir() && force_journal_stray) {
9235 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
9236 metablob->renamed_dirino = oldin->ino();
9237 }
9238
9239 // prepare
9240 CInode::mempool_inode *spi = 0; // renamed inode
9241 CInode::mempool_inode *tpi = 0; // target/overwritten inode
9242
9243 // target inode
9244 if (!linkmerge) {
9245 if (destdnl->is_primary()) {
9246 ceph_assert(straydn); // moving to straydn.
9247 // link--, and move.
9248 if (destdn->is_auth()) {
9249 auto pi= oldin->project_inode(mdr); //project_snaprealm
9250 pi.inode->version = straydn->pre_dirty(pi.inode->version);
9251 pi.inode->update_backtrace();
9252 tpi = pi.inode.get();
9253 }
9254 straydn->push_projected_linkage(oldin);
9255 } else if (destdnl->is_remote()) {
9256 // nlink-- targeti
9257 if (oldin->is_auth()) {
9258 auto pi = oldin->project_inode(mdr);
9259 pi.inode->version = oldin->pre_dirty();
9260 tpi = pi.inode.get();
9261 }
9262 }
9263 }
9264
9265 // dest
9266 if (destdnl->is_null()) {
9267 /* handle_client_rename checks that alternate_name matches for existing destdn */
9268 destdn->set_alternate_name(alternate_name);
9269 }
9270 if (srcdnl->is_remote()) {
9271 if (!linkmerge) {
9272 // destdn
9273 if (destdn->is_auth())
9274 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
9275 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9276 // srci
9277 if (srci->is_auth()) {
9278 auto pi = srci->project_inode(mdr);
9279 pi.inode->version = srci->pre_dirty();
9280 spi = pi.inode.get();
9281 }
9282 } else {
9283 dout(10) << " will merge remote onto primary link" << dendl;
9284 if (destdn->is_auth()) {
9285 auto pi = oldin->project_inode(mdr);
9286 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
9287 spi = pi.inode.get();
9288 }
9289 }
9290 } else { // primary
9291 if (destdn->is_auth()) {
9292 version_t oldpv;
9293 if (srcdn->is_auth())
9294 oldpv = srci->get_projected_version();
9295 else {
9296 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
9297
9298 // note which dirfrags have child subtrees in the journal
9299 // event, so that we can open those (as bounds) during replay.
9300 if (srci->is_dir()) {
9301 auto&& ls = srci->get_dirfrags();
9302 for (const auto& dir : ls) {
9303 if (!dir->is_auth())
9304 metablob->renamed_dir_frags.push_back(dir->get_frag());
9305 }
9306 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
9307 }
9308 }
9309 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
9310 // & srcdnl->snaprealm
9311 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
9312 pi.inode->update_backtrace();
9313 spi = pi.inode.get();
9314 }
9315 destdn->push_projected_linkage(srci);
9316 }
9317
9318 // src
9319 if (srcdn->is_auth())
9320 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
9321 srcdn->push_projected_linkage(); // push null linkage
9322
9323 if (!silent) {
9324 if (spi) {
9325 spi->ctime = mdr->get_op_stamp();
9326 if (mdr->get_op_stamp() > spi->rstat.rctime)
9327 spi->rstat.rctime = mdr->get_op_stamp();
9328 spi->change_attr++;
9329 if (linkmerge)
9330 spi->nlink--;
9331 }
9332 if (tpi) {
9333 tpi->ctime = mdr->get_op_stamp();
9334 if (mdr->get_op_stamp() > tpi->rstat.rctime)
9335 tpi->rstat.rctime = mdr->get_op_stamp();
9336 tpi->change_attr++;
9337 {
9338 std::string t;
9339 destdn->make_path_string(t, true);
9340 tpi->stray_prior_path = std::move(t);
9341 }
9342 tpi->nlink--;
9343 if (tpi->nlink == 0)
9344 oldin->state_set(CInode::STATE_ORPHAN);
9345 }
9346 }
9347
9348 // prepare nesting, mtime updates
9349 int predirty_dir = silent ? 0:PREDIRTY_DIR;
9350
9351 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9352 // then link the source inode to destdn
9353 if (destdnl->is_primary()) {
9354 ceph_assert(straydn);
9355 if (straydn->is_auth()) {
9356 metablob->add_dir_context(straydn->get_dir());
9357 metablob->add_dir(straydn->get_dir(), true);
9358 }
9359 }
9360
9361 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
9362 CDir *oldin_dir = oldin->get_projected_parent_dir();
9363 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
9364 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
9365 }
9366
9367 // sub off target
9368 if (destdn->is_auth() && !destdnl->is_null()) {
9369 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
9370 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
9371 if (destdnl->is_primary()) {
9372 ceph_assert(straydn);
9373 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
9374 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
9375 }
9376 }
9377
9378 if (srcdnl->is_remote() && srci->is_auth()) {
9379 CDir *srci_dir = srci->get_projected_parent_dir();
9380 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
9381 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
9382 }
9383
9384 // move srcdn
9385 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
9386 int flags = predirty_dir | predirty_primary;
9387 if (srcdn->is_auth())
9388 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
9389 if (destdn->is_auth())
9390 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
9391
9392 // add it all to the metablob
9393 // target inode
9394 if (!linkmerge) {
9395 if (destdnl->is_primary()) {
9396 ceph_assert(straydn);
9397 if (destdn->is_auth()) {
9398 // project snaprealm, too
9399 if (auto& desti_srnode = mdr->more()->desti_srnode) {
9400 oldin->project_snaprealm(desti_srnode);
9401 if (tpi->nlink == 0)
9402 ceph_assert(!desti_srnode->is_parent_global());
9403 desti_srnode = NULL;
9404 }
9405 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9406 metablob->add_primary_dentry(straydn, oldin, true, true);
9407 } else if (force_journal_stray) {
9408 dout(10) << " forced journaling straydn " << *straydn << dendl;
9409 metablob->add_dir_context(straydn->get_dir());
9410 metablob->add_primary_dentry(straydn, oldin, true);
9411 }
9412 } else if (destdnl->is_remote()) {
9413 if (oldin->is_auth()) {
9414 sr_t *new_srnode = NULL;
9415 if (mdr->peer_request) {
9416 if (mdr->peer_request->desti_snapbl.length() > 0) {
9417 new_srnode = new sr_t();
9418 auto p = mdr->peer_request->desti_snapbl.cbegin();
9419 decode(*new_srnode, p);
9420 }
9421 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9422 new_srnode = desti_srnode;
9423 desti_srnode = NULL;
9424 }
9425 if (new_srnode) {
9426 oldin->project_snaprealm(new_srnode);
9427 if (tpi->nlink == 0)
9428 ceph_assert(!new_srnode->is_parent_global());
9429 }
9430 // auth for targeti
9431 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
9432 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
9433 metablob->add_primary_dentry(oldin_pdn, oldin, true);
9434 }
9435 }
9436 }
9437
9438 // dest
9439 if (srcdnl->is_remote()) {
9440 ceph_assert(!linkmerge);
9441 if (destdn->is_auth() && !destdnl->is_null())
9442 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9443 else
9444 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9445
9446 if (destdn->is_auth())
9447 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9448
9449 if (srci->is_auth()) { // it's remote
9450 if (mdr->peer_request) {
9451 if (mdr->peer_request->srci_snapbl.length() > 0) {
9452 sr_t *new_srnode = new sr_t();
9453 auto p = mdr->peer_request->srci_snapbl.cbegin();
9454 decode(*new_srnode, p);
9455 srci->project_snaprealm(new_srnode);
9456 }
9457 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9458 srci->project_snaprealm(srci_srnode);
9459 srci_srnode = NULL;
9460 }
9461
9462 CDentry *srci_pdn = srci->get_projected_parent_dn();
9463 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
9464 metablob->add_primary_dentry(srci_pdn, srci, true);
9465 }
9466 } else if (srcdnl->is_primary()) {
9467 // project snap parent update?
9468 if (destdn->is_auth()) {
9469 if (auto& srci_srnode = mdr->more()->srci_srnode) {
9470 srci->project_snaprealm(srci_srnode);
9471 srci_srnode = NULL;
9472 }
9473 }
9474
9475 if (destdn->is_auth() && !destdnl->is_null())
9476 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9477
9478 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9479 {
9480 auto do_corruption = inject_rename_corrupt_dentry_first;
9481 if (unlikely(do_corruption > 0.0)) {
9482 auto r = ceph::util::generate_random_number(0.0, 1.0);
9483 if (r < do_corruption) {
9484 dout(0) << "corrupting dn: " << *destdn << dendl;
9485 destdn->first = -10;
9486 }
9487 }
9488 }
9489
9490 if (destdn->is_auth())
9491 metablob->add_primary_dentry(destdn, srci, true, true);
9492 else if (force_journal_dest) {
9493 dout(10) << " forced journaling destdn " << *destdn << dendl;
9494 metablob->add_dir_context(destdn->get_dir());
9495 metablob->add_primary_dentry(destdn, srci, true);
9496 if (srcdn->is_auth() && srci->is_dir()) {
9497 // journal new subtrees root dirfrags
9498 auto&& ls = srci->get_dirfrags();
9499 for (const auto& dir : ls) {
9500 if (dir->is_auth())
9501 metablob->add_dir(dir, true);
9502 }
9503 }
9504 }
9505 }
9506
9507 // src
9508 if (srcdn->is_auth()) {
9509 dout(10) << " journaling srcdn " << *srcdn << dendl;
9510 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
9511 // also journal the inode in case we need to do a peer rename rollback. it is OK to add
9512 // both primary and null dentries, because during journal replay the null dentry is
9513 // processed after the primary dentry.
9514 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
9515 metablob->add_primary_dentry(srcdn, srci, true);
9516 metablob->add_null_dentry(srcdn, true);
9517 } else
9518 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
9519
9520 // make renamed inode first track the dn
9521 if (srcdnl->is_primary() && destdn->is_auth()) {
9522 ceph_assert(srci->first <= destdn->first);
9523 srci->first = destdn->first;
9524 }
9525 // make stray inode first track the straydn
9526 if (straydn && straydn->is_auth()) {
9527 ceph_assert(oldin->first <= straydn->first);
9528 oldin->first = straydn->first;
9529 }
9530
9531 if (oldin && oldin->is_dir()) {
9532 ceph_assert(straydn);
9533 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
9534 }
9535 if (srci->is_dir())
9536 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
9537
9538 }
9539
9540
9541 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9542 {
9543 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9544 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
9545
9546 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9547 CDentry::linkage_t *destdnl = destdn->get_linkage();
9548
9549 CInode *oldin = destdnl->get_inode();
9550
9551 // primary+remote link merge?
9552 bool linkmerge = (srcdnl->get_inode() == oldin);
9553 if (linkmerge)
9554 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9555
9556 bool new_in_snaprealm = false;
9557 bool new_oldin_snaprealm = false;
9558
9559 // target inode
9560 if (!linkmerge) {
9561 if (destdnl->is_primary()) {
9562 ceph_assert(straydn);
9563 dout(10) << "straydn is " << *straydn << dendl;
9564
9565 // if there is newly created snaprealm, need to split old snaprealm's
9566 // inodes_with_caps. So pop snaprealm before linkage changes.
9567 if (destdn->is_auth()) {
9568 bool hadrealm = (oldin->snaprealm ? true : false);
9569 oldin->early_pop_projected_snaprealm();
9570 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
9571 } else {
9572 ceph_assert(mdr->peer_request);
9573 if (mdr->peer_request->desti_snapbl.length()) {
9574 new_oldin_snaprealm = !oldin->snaprealm;
9575 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9576 ceph_assert(oldin->snaprealm);
9577 }
9578 }
9579
9580 destdn->get_dir()->unlink_inode(destdn, false);
9581
9582 straydn->pop_projected_linkage();
9583 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9584 ceph_assert(!straydn->is_projected()); // no other projected
9585
9586 // nlink-- targeti
9587 if (destdn->is_auth())
9588 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9589
9590 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
9591 } else if (destdnl->is_remote()) {
9592 destdn->get_dir()->unlink_inode(destdn, false);
9593 if (oldin->is_auth()) {
9594 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9595 } else if (mdr->peer_request) {
9596 if (mdr->peer_request->desti_snapbl.length() > 0) {
9597 ceph_assert(oldin->snaprealm);
9598 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9599 }
9600 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9601 delete desti_srnode;
9602 desti_srnode = NULL;
9603 }
9604 }
9605 }
9606
9607 // unlink src before we relink it at dest
9608 CInode *in = srcdnl->get_inode();
9609 ceph_assert(in);
9610
9611 bool srcdn_was_remote = srcdnl->is_remote();
9612 if (!srcdn_was_remote) {
9613 // if there is newly created snaprealm, need to split old snaprealm's
9614 // inodes_with_caps. So pop snaprealm before linkage changes.
9615 if (destdn->is_auth()) {
9616 bool hadrealm = (in->snaprealm ? true : false);
9617 in->early_pop_projected_snaprealm();
9618 new_in_snaprealm = (in->snaprealm && !hadrealm);
9619 } else {
9620 ceph_assert(mdr->peer_request);
9621 if (mdr->peer_request->srci_snapbl.length()) {
9622 new_in_snaprealm = !in->snaprealm;
9623 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9624 ceph_assert(in->snaprealm);
9625 }
9626 }
9627 }
9628
9629 srcdn->get_dir()->unlink_inode(srcdn);
9630
9631 // dest
9632 if (srcdn_was_remote) {
9633 if (!linkmerge) {
9634 // destdn
9635 destdnl = destdn->pop_projected_linkage();
9636 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9637 ceph_assert(!destdn->is_projected()); // no other projected
9638
9639 destdn->link_remote(destdnl, in);
9640 if (destdn->is_auth())
9641 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9642 // in
9643 if (in->is_auth()) {
9644 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9645 } else if (mdr->peer_request) {
9646 if (mdr->peer_request->srci_snapbl.length() > 0) {
9647 ceph_assert(in->snaprealm);
9648 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9649 }
9650 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9651 delete srci_srnode;
9652 srci_srnode = NULL;
9653 }
9654 } else {
9655 dout(10) << "merging remote onto primary link" << dendl;
9656 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9657 }
9658 } else { // primary
9659 if (linkmerge) {
9660 dout(10) << "merging primary onto remote link" << dendl;
9661 destdn->get_dir()->unlink_inode(destdn, false);
9662 }
9663 destdnl = destdn->pop_projected_linkage();
9664 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9665 ceph_assert(!destdn->is_projected()); // no other projected
9666
9667 // srcdn inode import?
9668 if (!srcdn->is_auth() && destdn->is_auth()) {
9669 ceph_assert(mdr->more()->inode_import.length() > 0);
9670
9671 map<client_t,Capability::Import> imported_caps;
9672
9673 // finish cap imports
9674 finish_force_open_sessions(mdr->more()->imported_session_map);
9675 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9676 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
9677 mdr->more()->srcdn_auth_mds, true,
9678 mdr->more()->imported_session_map,
9679 mdr->more()->cap_imports[destdnl->get_inode()],
9680 imported_caps);
9681 }
9682
9683 mdr->more()->inode_import.clear();
9684 encode(imported_caps, mdr->more()->inode_import);
9685
9686 /* hack: add an auth pin for each xlock we hold. These were
9687 * remote xlocks previously but now they're local and
9688 * we're going to try and unpin when we xlock_finish. */
9689
9690 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9691 i != mdr->locks.end();
9692 ++i) {
9693 SimpleLock *lock = i->lock;
9694 if (lock->get_parent() != destdnl->get_inode())
9695 break;
9696 if (i->is_xlock() && !lock->is_locallock())
9697 mds->locker->xlock_import(lock);
9698 }
9699
9700 // hack: fix auth bit
9701 in->state_set(CInode::STATE_AUTH);
9702
9703 mdr->clear_ambiguous_auth();
9704 }
9705
9706 if (destdn->is_auth())
9707 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9708 }
9709
9710 // src
9711 if (srcdn->is_auth())
9712 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9713 srcdn->pop_projected_linkage();
9714 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9715 ceph_assert(!srcdn->is_projected()); // no other projected
9716
9717 // apply remaining projected inodes (nested)
9718 mdr->apply();
9719
9720 // update subtree map?
9721 if (destdnl->is_primary() && in->is_dir())
9722 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
9723
9724 if (straydn && oldin->is_dir())
9725 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9726
9727 if (new_oldin_snaprealm)
9728 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9729 if (new_in_snaprealm)
9730 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9731
9732 // removing a new dn?
9733 if (srcdn->is_auth())
9734 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9735 }
9736
9737
9738
9739 // ------------
9740 // PEER
9741
9742 class C_MDS_PeerRenamePrep : public ServerLogContext {
9743 CDentry *srcdn, *destdn, *straydn;
9744 public:
9745 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9746 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9747 void finish(int r) override {
9748 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
9749 }
9750 };
9751
9752 class C_MDS_PeerRenameCommit : public ServerContext {
9753 MDRequestRef mdr;
9754 CDentry *srcdn, *destdn, *straydn;
9755 public:
9756 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9757 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9758 void finish(int r) override {
9759 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
9760 }
9761 };
9762
9763 class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
9764 MDRequestRef mdr;
9765 public:
9766 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
9767 ServerContext(s), mdr(r) {}
9768 void finish(int r) override {
9769 server->_peer_rename_sessions_flushed(mdr);
9770 }
9771 };
9772
9773 void Server::handle_peer_rename_prep(MDRequestRef& mdr)
9774 {
9775 dout(10) << "handle_peer_rename_prep " << *mdr
9776 << " " << mdr->peer_request->srcdnpath
9777 << " to " << mdr->peer_request->destdnpath
9778 << dendl;
9779
9780 if (mdr->peer_request->is_interrupted()) {
9781 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9782 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9783 reply->mark_interrupted();
9784 mds->send_message_mds(reply, mdr->peer_to_mds);
9785 mdr->reset_peer_request();
9786 return;
9787 }
9788
9789 // discover destdn
9790 filepath destpath(mdr->peer_request->destdnpath);
9791 dout(10) << " dest " << destpath << dendl;
9792 vector<CDentry*> trace;
9793 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9794 int r = mdcache->path_traverse(mdr, cf, destpath,
9795 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9796 &trace);
9797 if (r > 0) return;
9798 if (r == -CEPHFS_ESTALE) {
9799 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9800 mdr->peer_to_mds, true);
9801 return;
9802 }
9803 ceph_assert(r == 0); // we shouldn't get an error here!
9804
9805 CDentry *destdn = trace.back();
9806 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9807 dout(10) << " destdn " << *destdn << dendl;
9808 mdr->pin(destdn);
9809
9810 // discover srcdn
9811 filepath srcpath(mdr->peer_request->srcdnpath);
9812 dout(10) << " src " << srcpath << dendl;
9813 CInode *srci = nullptr;
9814 r = mdcache->path_traverse(mdr, cf, srcpath,
9815 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9816 &trace, &srci);
9817 if (r > 0) return;
9818 ceph_assert(r == 0);
9819
9820 CDentry *srcdn = trace.back();
9821 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9822 dout(10) << " srcdn " << *srcdn << dendl;
9823 mdr->pin(srcdn);
9824 mdr->pin(srci);
9825
9826 // stray?
9827 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9828 if (linkmerge)
9829 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9830 CDentry *straydn = mdr->straydn;
9831 if (destdnl->is_primary() && !linkmerge)
9832 ceph_assert(straydn);
9833
9834 mdr->set_op_stamp(mdr->peer_request->op_stamp);
9835 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9836
9837 // set up commit waiter (early, to clean up any freezing etc we do)
9838 if (!mdr->more()->peer_commit)
9839 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
9840
9841 // am i srcdn auth?
9842 if (srcdn->is_auth()) {
9843 set<mds_rank_t> srcdnrep;
9844 srcdn->list_replicas(srcdnrep);
9845
9846 bool reply_witness = false;
9847 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9848 // freeze?
9849 // we need this to
9850 // - avoid conflicting lock state changes
9851 // - avoid concurrent updates to the inode
9852 // (this could also be accomplished with the versionlock)
9853 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9854 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9855 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9856
9857 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9858 if (srcdnl->get_inode()->is_frozen_auth_pin())
9859 mdr->unfreeze_auth_pin();
9860
9861 if (!frozen_inode) {
9862 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9863 return;
9864 }
9865
9866 /*
9867 * set ambiguous auth for srci
9868 * NOTE: we don't worry about ambiguous cache expire as we do
9869 * with subtree migrations because all peers will pin
9870 * srcdn->get_inode() for duration of this rename.
9871 */
9872 mdr->set_ambiguous_auth(srcdnl->get_inode());
9873
9874 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9875 // the leader will send another OP_RENAMEPREP peer request later.
9876 if (mdr->peer_request->witnesses.size() > 1) {
9877 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9878 reply_witness = true;
9879 }
9880
9881 // make sure bystanders have received all lock related messages
9882 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9883 if (*p == mdr->peer_to_mds ||
9884 (mds->is_cluster_degraded() &&
9885 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9886 continue;
9887 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
9888 mds->send_message_mds(notify, *p);
9889 mdr->more()->waiting_on_peer.insert(*p);
9890 }
9891
9892 // make sure clients have received all cap related messages
9893 set<client_t> export_client_set;
9894 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9895
9896 MDSGatherBuilder gather(g_ceph_context);
9897 flush_client_sessions(export_client_set, gather);
9898 if (gather.has_subs()) {
9899 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9900 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
9901 gather.activate();
9902 }
9903 }
9904
9905 // is witness list sufficient?
9906 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9907 if (*p == mdr->peer_to_mds ||
9908 mdr->peer_request->witnesses.count(*p)) continue;
9909 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9910 reply_witness = true;
9911 break;
9912 }
9913
9914 if (reply_witness) {
9915 ceph_assert(!srcdnrep.empty());
9916 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9917 reply->witnesses.swap(srcdnrep);
9918 mds->send_message_mds(reply, mdr->peer_to_mds);
9919 mdr->reset_peer_request();
9920 return;
9921 }
9922 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9923 if (!mdr->more()->waiting_on_peer.empty()) {
9924 dout(10) << " still waiting for rename notify acks from "
9925 << mdr->more()->waiting_on_peer << dendl;
9926 return;
9927 }
9928 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9929 // set ambiguous auth for srci on witnesses
9930 mdr->set_ambiguous_auth(srcdnl->get_inode());
9931 }
9932
9933 // encode everything we'd need to roll this back... basically, just the original state.
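// roughly: the src/dest/stray dentry locations and names, the dirfrags'
// old mtime/rctime (so _rollback_repair_dir can restore them), and the
// src/dest inodes' snaprealm blobs when present.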
9934 rename_rollback rollback;
9935
9936 rollback.reqid = mdr->reqid;
9937
9938 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9939 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9940 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
9941 rollback.orig_src.dname = srcdn->get_name();
9942 if (srcdnl->is_primary())
9943 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9944 else {
9945 ceph_assert(srcdnl->is_remote());
9946 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9947 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9948 }
9949
9950 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9951 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9952 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
9953 rollback.orig_dest.dname = destdn->get_name();
9954 if (destdnl->is_primary())
9955 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9956 else if (destdnl->is_remote()) {
9957 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9958 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9959 }
9960
9961 if (straydn) {
9962 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9963 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9964 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
9965 rollback.stray.dname = straydn->get_name();
9966 }
9967 if (mdr->peer_request->desti_snapbl.length()) {
9968 CInode *oldin = destdnl->get_inode();
9969 if (oldin->snaprealm) {
9970 encode(true, rollback.desti_snapbl);
9971 oldin->encode_snap_blob(rollback.desti_snapbl);
9972 } else {
9973 encode(false, rollback.desti_snapbl);
9974 }
9975 }
9976 if (mdr->peer_request->srci_snapbl.length()) {
9977 if (srci->snaprealm) {
9978 encode(true, rollback.srci_snapbl);
9979 srci->encode_snap_blob(rollback.srci_snapbl);
9980 } else {
9981 encode(false, rollback.srci_snapbl);
9982 }
9983 }
9984 encode(rollback, mdr->more()->rollback_bl);
9985 // FIXME: rollback snaprealm
9986 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9987
9988 // journal.
9989 mdr->ls = mdlog->get_current_segment();
9990 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9991 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
9992 mdlog->start_entry(le);
9993 le->rollback = mdr->more()->rollback_bl;
9994
9995 bufferlist blah; // inode import data... obviously not used if we're the peer
9996 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
9997
9998 if (le->commit.empty()) {
9999 dout(10) << " empty metablob, skipping journal" << dendl;
10000 mdlog->cancel_entry(le);
10001 mdr->ls = NULL;
10002 _logged_peer_rename(mdr, srcdn, destdn, straydn);
10003 } else {
10004 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
10005 mdr->more()->peer_update_journaled = true;
10006 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
10007 mdr, __func__);
10008 mdlog->flush();
10009 }
10010 }
10011
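// The PREPARE event (if any) has been journaled. If we are the auth for a
// primary srcdn, bundle the source inode up (caps + client maps) for export
// to the leader, apply the rename locally, and, unless the mdr was aborted,
// ack with OP_RENAMEPREPACK.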
10012 void Server::_logged_peer_rename(MDRequestRef& mdr,
10013 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
10014 {
10015 dout(10) << "_logged_peer_rename " << *mdr << dendl;
10016
10017 // prepare ack
10018 ref_t<MMDSPeerRequest> reply;
10019 if (!mdr->aborted) {
10020 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
10021 if (!mdr->more()->peer_update_journaled)
10022 reply->mark_not_journaled();
10023 }
10024
10025 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
10026 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
10027
10028 // export srci?
10029 if (srcdn->is_auth() && srcdnl->is_primary()) {
10030 // set export bounds for CInode::encode_export()
10031 if (reply) {
10032 std::vector<CDir*> bounds;
10033 if (srcdnl->get_inode()->is_dir()) {
10034 srcdnl->get_inode()->get_dirfrags(bounds);
10035 for (const auto& bound : bounds) {
10036 bound->state_set(CDir::STATE_EXPORTBOUND);
10037 }
10038 }
10039
10040 map<client_t,entity_inst_t> exported_client_map;
10041 map<client_t, client_metadata_t> exported_client_metadata_map;
10042 bufferlist inodebl;
10043 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
10044 exported_client_map,
10045 exported_client_metadata_map);
10046
10047 for (const auto& bound : bounds) {
10048 bound->state_clear(CDir::STATE_EXPORTBOUND);
10049 }
10050
10051 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
10052 encode(exported_client_metadata_map, reply->inode_export);
10053 reply->inode_export.claim_append(inodebl);
10054 reply->inode_export_v = srcdnl->get_inode()->get_version();
10055 }
10056
10057 // remove mdr auth pin
10058 mdr->auth_unpin(srcdnl->get_inode());
10059 mdr->more()->is_inode_exporter = true;
10060
10061 if (srcdnl->get_inode()->is_dirty())
10062 srcdnl->get_inode()->mark_clean();
10063
10064 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
10065 }
10066
10067 // apply
10068 _rename_apply(mdr, srcdn, destdn, straydn);
10069
10070 CDentry::linkage_t *destdnl = destdn->get_linkage();
10071
10072 // bump popularity
10073 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
10074 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
10075 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
10076
10077 // done.
10078 mdr->reset_peer_request();
10079 mdr->straydn = 0;
10080
10081 if (reply) {
10082 mds->send_message_mds(reply, mdr->peer_to_mds);
10083 } else {
10084 ceph_assert(mdr->aborted);
10085 dout(10) << " abort flag set, finishing" << dendl;
10086 mdcache->request_finish(mdr);
10087 }
10088 }
10089
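// Called once the leader has resolved the rename. On success (r == 0) finish
// the inode export, unfreeze and drop ambiguous auth, then journal an
// EPeerUpdate OP_COMMIT (or go straight to _committed_peer() if the prep was
// never journaled). On failure, roll back using the blob encoded during prep.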
10090 void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
10091 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
10092 {
10093 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
10094
10095 CInode *in = destdn->get_linkage()->get_inode();
10096
10097 inodeno_t migrated_stray;
10098 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
10099 migrated_stray = in->ino();
10100
10101 MDSContext::vec finished;
10102 if (r == 0) {
10103 // unfreeze+singleauth inode
10104 // hmm, do i really need to delay this?
10105 if (mdr->more()->is_inode_exporter) {
10106 // drop our pins
10107 // we exported, clear out any xlocks that we moved to another MDS
10108
10109 for (auto i = mdr->locks.lower_bound(&in->versionlock);
10110 i != mdr->locks.end(); ) {
10111 SimpleLock *lock = i->lock;
10112 if (lock->get_parent() != in)
10113 break;
10114 // we only care about xlocks on the exported inode
10115 if (i->is_xlock() && !lock->is_locallock())
10116 mds->locker->xlock_export(i++, mdr.get());
10117 else
10118 ++i;
10119 }
10120
10121 map<client_t,Capability::Import> peer_imported;
10122 auto bp = mdr->more()->inode_import.cbegin();
10123 decode(peer_imported, bp);
10124
10125 dout(10) << " finishing inode export on " << *in << dendl;
10126 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
10127 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
10128
10129 // unfreeze
10130 ceph_assert(in->is_frozen_inode());
10131 in->unfreeze_inode(finished);
10132 }
10133
10134 // singleauth
10135 if (mdr->more()->is_ambiguous_auth) {
10136 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10137 mdr->more()->is_ambiguous_auth = false;
10138 }
10139
10140 if (straydn && mdr->more()->peer_update_journaled) {
10141 CInode *strayin = straydn->get_projected_linkage()->get_inode();
10142 if (strayin && !strayin->snaprealm)
10143 mdcache->clear_dirty_bits_for_stray(strayin);
10144 }
10145
10146 mds->queue_waiters(finished);
10147 mdr->cleanup();
10148
10149 if (mdr->more()->peer_update_journaled) {
10150 // write a commit to the journal
10151 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
10152 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
10153 EPeerUpdate::RENAME);
10154 mdlog->start_entry(le);
10155 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
10156 mdlog->flush();
10157 } else {
10158 _committed_peer(mdr);
10159 }
10160 } else {
10161
10162 // abort
10163 // rollback_bl may be empty if we froze the inode but had to provide an expanded
10164 // witness list from the leader, and they failed before we tried prep again.
10165 if (mdr->more()->rollback_bl.length()) {
10166 if (mdr->more()->is_inode_exporter) {
10167 dout(10) << " reversing inode export of " << *in << dendl;
10168 in->abort_export();
10169 }
10170 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
10171 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
10172 // rollback but preserve the peer request
10173 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
10174 mdr->more()->rollback_bl.clear();
10175 } else
10176 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
10177 } else {
10178 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
10179 // singleauth
10180 if (mdr->more()->is_ambiguous_auth) {
10181 if (srcdn->is_auth())
10182 mdr->more()->rename_inode->unfreeze_inode(finished);
10183
10184 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10185 mdr->more()->is_ambiguous_auth = false;
10186 }
10187 mds->queue_waiters(finished);
10188 mdcache->request_finish(mdr);
10189 }
10190 }
10191
10192 if (migrated_stray && mds->is_stopping())
10193 mdcache->shutdown_export_stray_finish(migrated_stray);
10194 }
10195
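// Re-add one entry's worth of fragstat/rstat to a dirfrag and, if nothing
// else has touched it since the rename, restore its old mtime/rctime.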
10196 static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
10197 rename_rollback::drec &r, utime_t ctime,
10198 bool isdir, const nest_info_t &rstat)
10199 {
10200 auto pf = dir->project_fnode(mut);
10201 pf->version = dir->pre_dirty();
10202
10203 if (isdir) {
10204 pf->fragstat.nsubdirs += 1;
10205 } else {
10206 pf->fragstat.nfiles += 1;
10207 }
10208 if (r.ino) {
10209 pf->rstat.rbytes += rstat.rbytes;
10210 pf->rstat.rfiles += rstat.rfiles;
10211 pf->rstat.rsubdirs += rstat.rsubdirs;
10212 pf->rstat.rsnaps += rstat.rsnaps;
10213 }
10214 if (pf->fragstat.mtime == ctime) {
10215 pf->fragstat.mtime = r.dirfrag_old_mtime;
10216 if (pf->rstat.rctime == ctime)
10217 pf->rstat.rctime = r.dirfrag_old_rctime;
10218 }
10219 mut->add_updated_lock(&dir->get_inode()->filelock);
10220 mut->add_updated_lock(&dir->get_inode()->nestlock);
10221 }
10222
10223 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
10224 MutationRef mut;
10225 CDentry *srcdn;
10226 version_t srcdnpv;
10227 CDentry *destdn;
10228 CDentry *straydn;
10229 map<client_t,ref_t<MClientSnap>> splits[2];
10230 bool finish_mdr;
10231 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
10232 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
10233 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
10234 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
10235 straydn(st), finish_mdr(f) {
10236 splits[0].swap(_splits[0]);
10237 splits[1].swap(_splits[1]);
10238 }
10239 void finish(int r) override {
10240 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
10241 destdn, straydn, splits, finish_mdr);
10242 }
10243 };
10244
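// Undo a prepared-but-uncommitted peer rename: decode the rollback blob,
// re-link the src/dest/stray dentries to their original targets, restore
// ctimes, rstats and snaprealms, and journal an EPeerUpdate OP_ROLLBACK
// where needed before finishing up in _rename_rollback_finish().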
10245 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
10246 bool finish_mdr)
10247 {
10248 rename_rollback rollback;
10249 auto p = rbl.cbegin();
10250 decode(rollback, p);
10251
10252 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
10253 // need to finish this update before sending resolve to claim the subtree
10254 mdcache->add_rollback(rollback.reqid, leader);
10255
10256 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
10257 mut->ls = mds->mdlog->get_current_segment();
10258
10259 CDentry *srcdn = NULL;
10260 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
10261 if (!srcdir)
10262 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
10263 if (srcdir) {
10264 dout(10) << " srcdir " << *srcdir << dendl;
10265 srcdn = srcdir->lookup(rollback.orig_src.dname);
10266 if (srcdn) {
10267 dout(10) << " srcdn " << *srcdn << dendl;
10268 ceph_assert(srcdn->get_linkage()->is_null());
10269 } else
10270 dout(10) << " srcdn not found" << dendl;
10271 } else
10272 dout(10) << " srcdir not found" << dendl;
10273
10274 CDentry *destdn = NULL;
10275 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
10276 if (!destdir)
10277 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
10278 if (destdir) {
10279 dout(10) << " destdir " << *destdir << dendl;
10280 destdn = destdir->lookup(rollback.orig_dest.dname);
10281 if (destdn)
10282 dout(10) << " destdn " << *destdn << dendl;
10283 else
10284 dout(10) << " destdn not found" << dendl;
10285 } else
10286 dout(10) << " destdir not found" << dendl;
10287
10288 CInode *in = NULL;
10289 if (rollback.orig_src.ino) {
10290 in = mdcache->get_inode(rollback.orig_src.ino);
10291 if (in && in->is_dir())
10292 ceph_assert(srcdn && destdn);
10293 } else
10294 in = mdcache->get_inode(rollback.orig_src.remote_ino);
10295
10296 CDir *straydir = NULL;
10297 CDentry *straydn = NULL;
10298 if (rollback.stray.dirfrag.ino) {
10299 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
10300 if (straydir) {
10301 dout(10) << "straydir " << *straydir << dendl;
10302 straydn = straydir->lookup(rollback.stray.dname);
10303 if (straydn) {
10304 dout(10) << " straydn " << *straydn << dendl;
10305 ceph_assert(straydn->get_linkage()->is_primary());
10306 } else
10307 dout(10) << " straydn not found" << dendl;
10308 } else
10309 dout(10) << "straydir not found" << dendl;
10310 }
10311
10312 CInode *target = NULL;
10313 if (rollback.orig_dest.ino) {
10314 target = mdcache->get_inode(rollback.orig_dest.ino);
10315 if (target)
10316 ceph_assert(destdn && straydn);
10317 } else if (rollback.orig_dest.remote_ino)
10318 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
10319
10320 // can't use is_auth() in the resolve stage
10321 mds_rank_t whoami = mds->get_nodeid();
10322 // peer
10323 ceph_assert(!destdn || destdn->authority().first != whoami);
10324 ceph_assert(!straydn || straydn->authority().first != whoami);
10325
10326 bool force_journal_src = false;
10327 bool force_journal_dest = false;
10328 if (in && in->is_dir() && srcdn->authority().first != whoami)
10329 force_journal_src = _need_force_journal(in, false);
10330 if (in && target && target->is_dir())
10331 force_journal_dest = _need_force_journal(in, true);
10332
10333 version_t srcdnpv = 0;
10334 // repair src
10335 if (srcdn) {
10336 if (srcdn->authority().first == whoami)
10337 srcdnpv = srcdn->pre_dirty();
10338 if (rollback.orig_src.ino) {
10339 ceph_assert(in);
10340 srcdn->push_projected_linkage(in);
10341 } else
10342 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
10343 rollback.orig_src.remote_d_type);
10344 }
10345
10346 map<client_t,ref_t<MClientSnap>> splits[2];
10347
10348 const CInode::mempool_inode *pip = nullptr;
10349 if (in) {
10350 bool projected;
10351 CDir *pdir = in->get_projected_parent_dir();
10352 if (pdir->authority().first == whoami) {
10353 auto pi = in->project_inode(mut);
10354 pi.inode->version = in->pre_dirty();
10355 if (pdir != srcdir) {
10356 auto pf = pdir->project_fnode(mut);
10357 pf->version = pdir->pre_dirty();
10358 }
10359 if (pi.inode->ctime == rollback.ctime)
10360 pi.inode->ctime = rollback.orig_src.old_ctime;
10361 projected = true;
10362 } else {
10363 if (in->get_inode()->ctime == rollback.ctime) {
10364 auto _inode = CInode::allocate_inode(*in->get_inode());
10365 _inode->ctime = rollback.orig_src.old_ctime;
10366 in->reset_inode(_inode);
10367 }
10368 projected = false;
10369 }
10370 pip = in->get_projected_inode().get();
10371
10372 if (rollback.srci_snapbl.length() && in->snaprealm) {
10373 bool hadrealm;
10374 auto p = rollback.srci_snapbl.cbegin();
10375 decode(hadrealm, p);
10376 if (hadrealm) {
10377 if (projected && !mds->is_resolve()) {
10378 sr_t *new_srnode = new sr_t();
10379 decode(*new_srnode, p);
10380 in->project_snaprealm(new_srnode);
10381 } else
10382 decode(in->snaprealm->srnode, p);
10383 } else {
10384 SnapRealm *realm;
10385 if (rollback.orig_src.ino) {
10386 ceph_assert(srcdir);
10387 realm = srcdir->get_inode()->find_snaprealm();
10388 } else {
10389 realm = in->snaprealm->parent;
10390 }
10391 if (!mds->is_resolve())
10392 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
10393 if (projected)
10394 in->project_snaprealm(NULL);
10395 else
10396 in->snaprealm->merge_to(realm);
10397 }
10398 }
10399 }
10400
10401 // repair dest
10402 if (destdn) {
10403 if (rollback.orig_dest.ino && target) {
10404 destdn->push_projected_linkage(target);
10405 } else if (rollback.orig_dest.remote_ino) {
10406 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
10407 rollback.orig_dest.remote_d_type);
10408 } else {
10409 // the dentry will be trimmed soon, it's ok to have wrong linkage
10410 if (rollback.orig_dest.ino)
10411 ceph_assert(mds->is_resolve());
10412 destdn->push_projected_linkage();
10413 }
10414 }
10415
10416 if (straydn)
10417 straydn->push_projected_linkage();
10418
10419 if (target) {
10420 bool projected;
10421 CInode::inode_ptr ti;
10422 CDir *pdir = target->get_projected_parent_dir();
10423 if (pdir->authority().first == whoami) {
10424 auto pi = target->project_inode(mut);
10425 pi.inode->version = target->pre_dirty();
10426 if (pdir != srcdir) {
10427 auto pf = pdir->project_fnode(mut);
10428 pf->version = pdir->pre_dirty();
10429 }
10430 ti = pi.inode;
10431 projected = true;
10432 } else {
10433 ti = CInode::allocate_inode(*target->get_inode());
10434 projected = false;
10435 }
10436
10437 if (ti->ctime == rollback.ctime)
10438 ti->ctime = rollback.orig_dest.old_ctime;
10439 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
10440 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
10441 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
10442 else
10443 ceph_assert(rollback.orig_dest.remote_ino &&
10444 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
10445 } else
10446 ti->nlink++;
10447
10448 if (!projected)
10449 target->reset_inode(ti);
10450
10451 if (rollback.desti_snapbl.length() && target->snaprealm) {
10452 bool hadrealm;
10453 auto p = rollback.desti_snapbl.cbegin();
10454 decode(hadrealm, p);
10455 if (hadrealm) {
10456 if (projected && !mds->is_resolve()) {
10457 sr_t *new_srnode = new sr_t();
10458 decode(*new_srnode, p);
10459 target->project_snaprealm(new_srnode);
10460 } else
10461 decode(target->snaprealm->srnode, p);
10462 } else {
10463 SnapRealm *realm;
10464 if (rollback.orig_dest.ino) {
10465 ceph_assert(destdir);
10466 realm = destdir->get_inode()->find_snaprealm();
10467 } else {
10468 realm = target->snaprealm->parent;
10469 }
10470 if (!mds->is_resolve())
10471 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
10472 if (projected)
10473 target->project_snaprealm(NULL);
10474 else
10475 target->snaprealm->merge_to(realm);
10476 }
10477 }
10478 }
10479
10480 if (srcdn && srcdn->authority().first == whoami) {
10481 nest_info_t blah;
10482 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
10483 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
10484 }
10485
10486 if (srcdn)
10487 dout(0) << " srcdn back to " << *srcdn << dendl;
10488 if (in)
10489 dout(0) << " srci back to " << *in << dendl;
10490 if (destdn)
10491 dout(0) << " destdn back to " << *destdn << dendl;
10492 if (target)
10493 dout(0) << " desti back to " << *target << dendl;
10494
10495 // journal it
10496 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
10497 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
10498 mdlog->start_entry(le);
10499
10500 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
10501 le->commit.add_dir_context(srcdir);
10502 if (rollback.orig_src.ino)
10503 le->commit.add_primary_dentry(srcdn, 0, true);
10504 else
10505 le->commit.add_remote_dentry(srcdn, true);
10506 }
10507
10508 if (!rollback.orig_src.ino && // remote linkage
10509 in && in->authority().first == whoami) {
10510 le->commit.add_dir_context(in->get_projected_parent_dir());
10511 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
10512 }
10513
10514 if (force_journal_dest) {
10515 ceph_assert(rollback.orig_dest.ino);
10516 le->commit.add_dir_context(destdir);
10517 le->commit.add_primary_dentry(destdn, 0, true);
10518 }
10519
10520 // peer: no need to journal straydn
10521
10522 if (target && target != in && target->authority().first == whoami) {
10523 ceph_assert(rollback.orig_dest.remote_ino);
10524 le->commit.add_dir_context(target->get_projected_parent_dir());
10525 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
10526 }
10527
10528 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
10529 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
10530 le->commit.renamed_dirino = in->ino();
10531 if (srcdn->authority().first == whoami) {
10532 auto&& ls = in->get_dirfrags();
10533 for (const auto& dir : ls) {
10534 if (!dir->is_auth())
10535 le->commit.renamed_dir_frags.push_back(dir->get_frag());
10536 }
10537 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
10538 }
10539 } else if (force_journal_dest) {
10540 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
10541 le->commit.renamed_dirino = target->ino();
10542 }
10543
10544 if (target && target->is_dir()) {
10545 ceph_assert(destdn);
10546 mdcache->project_subtree_rename(target, straydir, destdir);
10547 }
10548
10549 if (in && in->is_dir()) {
10550 ceph_assert(srcdn);
10551 mdcache->project_subtree_rename(in, destdir, srcdir);
10552 }
10553
10554 if (mdr && !mdr->more()->peer_update_journaled) {
10555 ceph_assert(le->commit.empty());
10556 mdlog->cancel_entry(le);
10557 mut->ls = NULL;
10558 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
10559 } else {
10560 ceph_assert(!le->commit.empty());
10561 if (mdr)
10562 mdr->more()->peer_update_journaled = false;
10563 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
10564 srcdn, srcdnpv, destdn, straydn,
10565 splits, finish_mdr);
10566 submit_mdlog_entry(le, fin, mdr, __func__);
10567 mdlog->flush();
10568 }
10569 }
10570
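// The rollback has been journaled (or skipped): pop the projected linkages
// back into place, re-adjust subtrees for any renamed directories, and
// release whatever freeze/ambiguous-auth state the prep phase set up.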
10571 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
10572 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
10573 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
10574 {
10575 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
10576
10577 if (straydn) {
10578 straydn->get_dir()->unlink_inode(straydn);
10579 straydn->pop_projected_linkage();
10580 }
10581 if (destdn) {
10582 destdn->get_dir()->unlink_inode(destdn);
10583 destdn->pop_projected_linkage();
10584 }
10585 if (srcdn) {
10586 srcdn->pop_projected_linkage();
10587 if (srcdn->authority().first == mds->get_nodeid()) {
10588 srcdn->mark_dirty(srcdnpv, mut->ls);
10589 if (srcdn->get_linkage()->is_primary())
10590 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
10591 }
10592 }
10593
10594 mut->apply();
10595
10596 if (srcdn && srcdn->get_linkage()->is_primary()) {
10597 CInode *in = srcdn->get_linkage()->get_inode();
10598 if (in && in->is_dir()) {
10599 ceph_assert(destdn);
10600 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10601 }
10602 }
10603
10604 if (destdn) {
10605 CInode *oldin = destdn->get_linkage()->get_inode();
10606 // update subtree map?
10607 if (oldin && oldin->is_dir()) {
10608 ceph_assert(straydn);
10609 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10610 }
10611 }
10612
10613 if (mds->is_resolve()) {
10614 CDir *root = NULL;
10615 if (straydn)
10616 root = mdcache->get_subtree_root(straydn->get_dir());
10617 else if (destdn)
10618 root = mdcache->get_subtree_root(destdn->get_dir());
10619 if (root)
10620 mdcache->try_trim_non_auth_subtree(root);
10621 } else {
10622 mdcache->send_snaps(splits[1]);
10623 mdcache->send_snaps(splits[0]);
10624 }
10625
10626 if (mdr) {
10627 MDSContext::vec finished;
10628 if (mdr->more()->is_ambiguous_auth) {
10629 if (srcdn->is_auth())
10630 mdr->more()->rename_inode->unfreeze_inode(finished);
10631
10632 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10633 mdr->more()->is_ambiguous_auth = false;
10634 }
10635 mds->queue_waiters(finished);
10636 if (finish_mdr || mdr->aborted)
10637 mdcache->request_finish(mdr);
10638 else
10639 mdr->more()->peer_rolling_back = false;
10640 }
10641
10642 mdcache->finish_rollback(mut->reqid, mdr);
10643
10644 mut->cleanup();
10645 }
10646
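// Leader side: a witness peer acked OP_RENAMEPREP. Record it (or the extra
// witness list it asked us to add), stash any exported srci blob, and
// re-dispatch the client request once no acks remain outstanding.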
10647 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10648 {
10649 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10650 << " witnessed by " << ack->get_source()
10651 << " " << *ack << dendl;
10652 mds_rank_t from = mds_rank_t(ack->get_source().num());
10653
10654 // note peer
10655 mdr->more()->peers.insert(from);
10656 if (mdr->more()->srcdn_auth_mds == from &&
10657 mdr->more()->is_remote_frozen_authpin &&
10658 !mdr->more()->is_ambiguous_auth) {
10659 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10660 }
10661
10662 // witnessed? or add extra witnesses?
10663 ceph_assert(mdr->more()->witnessed.count(from) == 0);
10664 if (ack->is_interrupted()) {
10665 dout(10) << " peer request interrupted, noop" << dendl;
10666 } else if (ack->witnesses.empty()) {
10667 mdr->more()->witnessed.insert(from);
10668 if (!ack->is_not_journaled())
10669 mdr->more()->has_journaled_peers = true;
10670 } else {
10671 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
10672 mdr->more()->extra_witnesses = ack->witnesses;
10673 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10674 }
10675
10676 // srci import?
10677 if (ack->inode_export.length()) {
10678 dout(10) << " got srci import" << dendl;
10679 mdr->more()->inode_import.share(ack->inode_export);
10680 mdr->more()->inode_import_v = ack->inode_export_v;
10681 }
10682
10683 // remove from waiting list
10684 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10685 mdr->more()->waiting_on_peer.erase(from);
10686
10687 if (mdr->more()->waiting_on_peer.empty())
10688 dispatch_client_request(mdr); // go again!
10689 else
10690 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10691 }
10692
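// A bystander replica acked OP_RENAMENOTIFY. Once the waiting set drains
// (this also covers the MDS_RANK_NONE placeholder used for the client
// session flush below), the deferred peer request is dispatched again.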
10693 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10694 {
10695 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
10696 << ack->get_source() << dendl;
10697 ceph_assert(mdr->is_peer());
10698 mds_rank_t from = mds_rank_t(ack->get_source().num());
10699
10700 if (mdr->more()->waiting_on_peer.count(from)) {
10701 mdr->more()->waiting_on_peer.erase(from);
10702
10703 if (mdr->more()->waiting_on_peer.empty()) {
10704 if (mdr->peer_request)
10705 dispatch_peer_request(mdr);
10706 } else
10707 dout(10) << " still waiting for rename notify acks from "
10708 << mdr->more()->waiting_on_peer << dendl;
10709 }
10710 }
10711
10712 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
10713 {
10714 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
10715
10716 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10717 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
10718
10719 if (mdr->more()->waiting_on_peer.empty()) {
10720 if (mdr->peer_request)
10721 dispatch_peer_request(mdr);
10722 } else
10723 dout(10) << " still waiting for rename notify acks from "
10724 << mdr->more()->waiting_on_peer << dendl;
10725 }
10726 }
10727
10728 // snaps
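// lssnap builds a readdir-style reply by hand: an empty DirStat, then for
// each snapshot visible in the dir's snaprealm a dentry name, an infinite
// lease and an inodestat of the directory at that snapid, followed by the
// entry count and END/COMPLETE flags.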
10729 /* This function takes responsibility for the passed mdr*/
10730 void Server::handle_client_lssnap(MDRequestRef& mdr)
10731 {
10732 const cref_t<MClientRequest> &req = mdr->client_request;
10733
10734 // traverse to path
10735 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10736 if (!diri)
10737 return;
10738
10739 if (!diri->is_dir()) {
10740 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10741 return;
10742 }
10743 dout(10) << "lssnap on " << *diri << dendl;
10744
10745 // lock snap
10746 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10747 return;
10748
10749 if (!check_access(mdr, diri, MAY_READ))
10750 return;
10751
10752 SnapRealm *realm = diri->find_snaprealm();
10753 map<snapid_t,const SnapInfo*> infomap;
10754 realm->get_snap_info(infomap, diri->get_oldest_snap());
10755
10756 unsigned max_entries = req->head.args.readdir.max_entries;
10757 if (!max_entries)
10758 max_entries = infomap.size();
10759 int max_bytes = req->head.args.readdir.max_bytes;
10760 if (!max_bytes)
10761 // make sure at least one item can be encoded
10762 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
10763
10764 __u64 last_snapid = 0;
10765 string offset_str = req->get_path2();
10766 if (!offset_str.empty())
10767 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10768
10769 //Empty DirStat
10770 bufferlist dirbl;
10771 static DirStat empty;
10772 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10773
10774 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10775
10776 __u32 num = 0;
10777 bufferlist dnbl;
10778 auto p = infomap.upper_bound(last_snapid);
10779 for (; p != infomap.end() && num < max_entries; ++p) {
10780 dout(10) << p->first << " -> " << *p->second << dendl;
10781
10782 // actual
10783 string snap_name;
10784 if (p->second->ino == diri->ino())
10785 snap_name = p->second->name;
10786 else
10787 snap_name = p->second->get_long_name();
10788
10789 unsigned start_len = dnbl.length();
10790 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10791 break;
10792
10793 encode(snap_name, dnbl);
10794 //infinite lease
10795 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10796 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10797 dout(20) << "encode_infinite_lease" << dendl;
10798
10799 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10800 if (r < 0) {
10801 bufferlist keep;
10802 keep.substr_of(dnbl, 0, start_len);
10803 dnbl.swap(keep);
10804 break;
10805 }
10806 ++num;
10807 }
10808
10809 encode(num, dirbl);
10810 __u16 flags = 0;
10811 if (p == infomap.end()) {
10812 flags = CEPH_READDIR_FRAG_END;
10813 if (last_snapid == 0)
10814 flags |= CEPH_READDIR_FRAG_COMPLETE;
10815 }
10816 encode(flags, dirbl);
10817 dirbl.claim_append(dnbl);
10818
10819 mdr->reply_extra_bl = dirbl;
10820 mdr->tracei = diri;
10821 respond_to_request(mdr, 0);
10822 }
10823
10824
10825 // MKSNAP
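// mksnap flow, roughly: xlock the dir's snaplock, reserve a snapid via the
// snap table (prepare_create -> stid + snapidbl), project the inode and its
// snaprealm node, journal an EUpdate "mksnap" carrying a TABLE_SNAP
// transaction, then commit the table and send notifies in _mksnap_finish().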
10826
10827 struct C_MDS_mksnap_finish : public ServerLogContext {
10828 CInode *diri;
10829 SnapInfo info;
10830 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10831 ServerLogContext(s, r), diri(di), info(i) {}
10832 void finish(int r) override {
10833 server->_mksnap_finish(mdr, diri, info);
10834 }
10835 };
10836
10837 /* This function takes responsibility for the passed mdr*/
10838 void Server::handle_client_mksnap(MDRequestRef& mdr)
10839 {
10840 const cref_t<MClientRequest> &req = mdr->client_request;
10841 // make sure we have as new a map as the client
10842 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10843 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10844 return;
10845 }
10846 if (!mds->mdsmap->allows_snaps()) {
10847 // you can't make snapshots until you set an option right now
10848 dout(5) << "new snapshots are disabled for this fs" << dendl;
10849 respond_to_request(mdr, -CEPHFS_EPERM);
10850 return;
10851 }
10852
10853 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10854 if (!diri)
10855 return;
10856
10857 // dir only
10858 if (!diri->is_dir()) {
10859 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10860 return;
10861 }
10862 if (diri->is_system() && !diri->is_root()) {
10863 // no snaps in system dirs (root is ok)
10864 dout(5) << "is an internal system dir" << dendl;
10865 respond_to_request(mdr, -CEPHFS_EPERM);
10866 return;
10867 }
10868
10869 std::string_view snapname = req->get_filepath().last_dentry();
10870
10871 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10872 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10873 respond_to_request(mdr, -CEPHFS_EPERM);
10874 return;
10875 }
10876
10877 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10878
10879 // lock snap
10880 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10881 MutationImpl::LockOpVec lov;
10882 lov.add_xlock(&diri->snaplock);
10883 if (!mds->locker->acquire_locks(mdr, lov))
10884 return;
10885
10886 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10887 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10888 return;
10889 }
10890 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10891 }
10892
10893 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10894 return;
10895
10896 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10897 (subvol_ino && subvol_ino != diri->ino())) {
10898 dout(5) << "is a descendant of a subvolume dir" << dendl;
10899 respond_to_request(mdr, -CEPHFS_EPERM);
10900 return;
10901 }
10902
10903 // check if we can create any more snapshots
10904 // we don't allow any more if we are already at or beyond the limit
10905 if (diri->snaprealm &&
10906 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
10907 respond_to_request(mdr, -CEPHFS_EMLINK);
10908 return;
10909 }
10910
10911 // make sure name is unique
10912 if (diri->snaprealm &&
10913 diri->snaprealm->exists(snapname)) {
10914 respond_to_request(mdr, -CEPHFS_EEXIST);
10915 return;
10916 }
10917 if (snapname.length() == 0 ||
10918 snapname.length() > snapshot_name_max ||
10919 snapname[0] == '_') {
10920 respond_to_request(mdr, -CEPHFS_EINVAL);
10921 return;
10922 }
10923
10924 // allocate a snapid
10925 if (!mdr->more()->stid) {
10926 // prepare an stid
10927 mds->snapclient->prepare_create(diri->ino(), snapname,
10928 mdr->get_mds_stamp(),
10929 &mdr->more()->stid, &mdr->more()->snapidbl,
10930 new C_MDS_RetryRequest(mdcache, mdr));
10931 return;
10932 }
10933
10934 version_t stid = mdr->more()->stid;
10935 snapid_t snapid;
10936 auto p = mdr->more()->snapidbl.cbegin();
10937 decode(snapid, p);
10938 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10939
10940 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10941
10942 SnapPayload payload;
10943 if (req->get_data().length()) {
10944 try {
10945 auto iter = req->get_data().cbegin();
10946 decode(payload, iter);
10947 } catch (const ceph::buffer::error &e) {
10948 // backward compat -- client sends xattr bufferlist. however,
10949 // that is not used anywhere -- so (log and) ignore.
10950 dout(20) << ": no metadata in payload (old client?)" << dendl;
10951 }
10952 }
10953
10954 // journal
10955 SnapInfo info;
10956 info.ino = diri->ino();
10957 info.snapid = snapid;
10958 info.name = snapname;
10959 info.stamp = mdr->get_op_stamp();
10960 info.metadata = payload.metadata;
10961
10962 auto pi = diri->project_inode(mdr, false, true);
10963 pi.inode->ctime = info.stamp;
10964 if (info.stamp > pi.inode->rstat.rctime)
10965 pi.inode->rstat.rctime = info.stamp;
10966 pi.inode->rstat.rsnaps++;
10967 pi.inode->version = diri->pre_dirty();
10968
10969 // project the snaprealm
10970 auto &newsnap = *pi.snapnode;
10971 newsnap.created = snapid;
10972 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10973 if (!em.second)
10974 em.first->second = info;
10975 newsnap.seq = snapid;
10976 newsnap.last_created = snapid;
10977 newsnap.last_modified = info.stamp;
10978 newsnap.change_attr++;
10979
10980 // journal the inode changes
10981 mdr->ls = mdlog->get_current_segment();
10982 EUpdate *le = new EUpdate(mdlog, "mksnap");
10983 mdlog->start_entry(le);
10984
10985 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10986 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10987 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10988 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10989
10990 // journal the snaprealm changes
10991 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10992 mdr, __func__);
10993 mdlog->flush();
10994 }
10995
10996 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10997 {
10998 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10999
11000 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
11001
11002 mdr->apply();
11003
11004 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11005
11006 // create snap
11007 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11008
11009 // notify other mds
11010 mdcache->send_snap_update(diri, mdr->more()->stid, op);
11011
11012 mdcache->do_realm_invalidate_and_update_notify(diri, op);
11013
11014 // yay
11015 mdr->in[0] = diri;
11016 mdr->snapid = info.snapid;
11017 mdr->tracei = diri;
11018 respond_to_request(mdr, 0);
11019 }
11020
11021
11022 // RMSNAP
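// rmsnap mirrors mksnap: resolve the snap name, reserve a table transaction
// with prepare_destroy, project the inode/snaprealm with the snap erased,
// journal, then commit the table, notify, and purge stale snap data.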
11023
11024 struct C_MDS_rmsnap_finish : public ServerLogContext {
11025 CInode *diri;
11026 snapid_t snapid;
11027 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
11028 ServerLogContext(s, r), diri(di), snapid(sn) {}
11029 void finish(int r) override {
11030 server->_rmsnap_finish(mdr, diri, snapid);
11031 }
11032 };
11033
11034 /* This function takes responsibility for the passed mdr*/
11035 void Server::handle_client_rmsnap(MDRequestRef& mdr)
11036 {
11037 const cref_t<MClientRequest> &req = mdr->client_request;
11038
11039 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
11040 if (!diri)
11041 return;
11042
11043 if (!diri->is_dir()) {
11044 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11045 return;
11046 }
11047
11048 std::string_view snapname = req->get_filepath().last_dentry();
11049
11050 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
11051 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
11052 respond_to_request(mdr, -CEPHFS_EPERM);
11053 return;
11054 }
11055
11056 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
11057
11058 // does snap exist?
11059 if (snapname.length() == 0 || snapname[0] == '_') {
11060 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
11061 return;
11062 }
11063 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
11064 respond_to_request(mdr, -CEPHFS_ENOENT);
11065 return;
11066 }
11067 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
11068 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
11069 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11070 MutationImpl::LockOpVec lov;
11071 lov.add_xlock(&diri->snaplock);
11072 if (!mds->locker->acquire_locks(mdr, lov))
11073 return;
11074 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
11075 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
11076 return;
11077 }
11078 mdr->locking_state |= MutationImpl::ALL_LOCKED;
11079 }
11080
11081 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
11082 return;
11083
11084 // prepare
11085 if (!mdr->more()->stid) {
11086 mds->snapclient->prepare_destroy(diri->ino(), snapid,
11087 &mdr->more()->stid, &mdr->more()->snapidbl,
11088 new C_MDS_RetryRequest(mdcache, mdr));
11089 return;
11090 }
11091 version_t stid = mdr->more()->stid;
11092 auto p = mdr->more()->snapidbl.cbegin();
11093 snapid_t seq;
11094 decode(seq, p);
11095 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
11096
11097 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11098
11099 // journal
11100 auto pi = diri->project_inode(mdr, false, true);
11101 pi.inode->version = diri->pre_dirty();
11102 pi.inode->ctime = mdr->get_op_stamp();
11103 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11104 pi.inode->rstat.rctime = mdr->get_op_stamp();
11105 pi.inode->rstat.rsnaps--;
11106
11107 mdr->ls = mdlog->get_current_segment();
11108 EUpdate *le = new EUpdate(mdlog, "rmsnap");
11109 mdlog->start_entry(le);
11110
11111 // project the snaprealm
11112 auto &newnode = *pi.snapnode;
11113 newnode.snaps.erase(snapid);
11114 newnode.seq = seq;
11115 newnode.last_destroyed = seq;
11116 newnode.last_modified = mdr->get_op_stamp();
11117 newnode.change_attr++;
11118
11119 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11120 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11121 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11122 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11123
11124 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
11125 mdr, __func__);
11126 mdlog->flush();
11127 }
11128
11129 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11130 {
11131 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
11132 snapid_t stid = mdr->more()->stid;
11133
11134 mdr->apply();
11135
11136 mds->snapclient->commit(stid, mdr->ls);
11137
11138 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11139
11140 // notify other mds
11141 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
11142
11143 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
11144
11145 // yay
11146 mdr->in[0] = diri;
11147 mdr->tracei = diri;
11148 mdr->snapid = snapid;
11149 respond_to_request(mdr, 0);
11150
11151 // purge snapshot data
11152 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
11153 }
11154
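// RENAMESNAP

// renamesnap reuses the same pattern: prepare_update() against the snap
// table, project the snaprealm with the snap's name replaced, journal an
// EUpdate "renamesnap", then commit and send CEPH_SNAP_OP_UPDATE notifies.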
11155 struct C_MDS_renamesnap_finish : public ServerLogContext {
11156 CInode *diri;
11157 snapid_t snapid;
11158 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
11159 ServerLogContext(s, r), diri(di), snapid(sn) {}
11160 void finish(int r) override {
11161 server->_renamesnap_finish(mdr, diri, snapid);
11162 }
11163 };
11164
11165 /* This function takes responsibility for the passed mdr*/
11166 void Server::handle_client_renamesnap(MDRequestRef& mdr)
11167 {
11168 const cref_t<MClientRequest> &req = mdr->client_request;
11169 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
11170 respond_to_request(mdr, -CEPHFS_EINVAL);
11171 return;
11172 }
11173
11174 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
11175 if (!diri)
11176 return;
11177
11178 if (!diri->is_dir()) { // dir only
11179 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11180 return;
11181 }
11182
11183 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
11184 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
11185 respond_to_request(mdr, -CEPHFS_EPERM);
11186 return;
11187 }
11188
11189 std::string_view dstname = req->get_filepath().last_dentry();
11190 std::string_view srcname = req->get_filepath2().last_dentry();
11191 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
11192
11193 if (srcname.length() == 0 || srcname[0] == '_') {
11194 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
11195 return;
11196 }
11197 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
11198 respond_to_request(mdr, -CEPHFS_ENOENT);
11199 return;
11200 }
11201 if (dstname.length() == 0 || dstname[0] == '_') {
11202 respond_to_request(mdr, -CEPHFS_EINVAL);
11203 return;
11204 }
11205 if (diri->snaprealm->exists(dstname)) {
11206 respond_to_request(mdr, -CEPHFS_EEXIST);
11207 return;
11208 }
11209
11210 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
11211
11212 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
11213
11214 // lock snap
11215 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11216 MutationImpl::LockOpVec lov;
11217 lov.add_xlock(&diri->snaplock);
11218 if (!mds->locker->acquire_locks(mdr, lov))
11219 return;
11220 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
11221 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
11222 return;
11223 }
11224 mdr->locking_state |= MutationImpl::ALL_LOCKED;
11225 }
11226
11227 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
11228 return;
11229
11230 // prepare
11231 if (!mdr->more()->stid) {
11232 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
11233 &mdr->more()->stid,
11234 new C_MDS_RetryRequest(mdcache, mdr));
11235 return;
11236 }
11237
11238 version_t stid = mdr->more()->stid;
11239 dout(10) << " stid is " << stid << dendl;
11240
11241 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11242
11243 // journal
11244 auto pi = diri->project_inode(mdr, false, true);
11245 pi.inode->ctime = mdr->get_op_stamp();
11246 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11247 pi.inode->rstat.rctime = mdr->get_op_stamp();
11248 pi.inode->version = diri->pre_dirty();
11249
11250 // project the snaprealm
11251 auto &newsnap = *pi.snapnode;
11252 auto it = newsnap.snaps.find(snapid);
11253 ceph_assert(it != newsnap.snaps.end());
11254 it->second.name = dstname;
11255 newsnap.last_modified = mdr->get_op_stamp();
11256 newsnap.change_attr++;
11257
11258 // journal the inode changes
11259 mdr->ls = mdlog->get_current_segment();
11260 EUpdate *le = new EUpdate(mdlog, "renamesnap");
11261 mdlog->start_entry(le);
11262
11263 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11264 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11265 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11266 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11267
11268 // journal the snaprealm changes
11269 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
11270 mdr, __func__);
11271 mdlog->flush();
11272 }
11273
11274 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11275 {
11276 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
11277
11278 mdr->apply();
11279
11280 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11281
11282 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11283
11284 // notify other mds
11285 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
11286
11287 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
11288
11289 // yay
11290 mdr->in[0] = diri;
11291 mdr->tracei = diri;
11292 mdr->snapid = snapid;
11293 respond_to_request(mdr, 0);
11294 }
11295
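// readdir_snapdiff is a readdir variant that reports what changed between
// two snapshots of a directory (mdr->snapid vs. snapdiff.snap_other): same
// frag selection, cap-acquisition throttling and byte budgeting as readdir,
// with the per-entry diff work delegated to _readdir_diff().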
11296 void Server::handle_client_readdir_snapdiff(MDRequestRef& mdr)
11297 {
11298 const cref_t<MClientRequest>& req = mdr->client_request;
11299 Session* session = mds->get_session(req);
11300 MutationImpl::LockOpVec lov;
11301 CInode* diri = rdlock_path_pin_ref(mdr, false, true);
11302 if (!diri) return;
11303
11304 // it's a directory, right?
11305 if (!diri->is_dir()) {
11306 // not a dir
11307 dout(10) << "reply to " << *req << " snapdiff -CEPHFS_ENOTDIR" << dendl;
11308 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11309 return;
11310 }
11311
11312 auto num_caps = session->get_num_caps();
11313 auto session_cap_acquisition = session->get_cap_acquisition();
11314
11315 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
11316 dout(20) << "snapdiff throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
11317 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
11318 if (logger)
11319 logger->inc(l_mdss_cap_acquisition_throttle);
11320
11321 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
11322 return;
11323 }
11324
11325 lov.add_rdlock(&diri->filelock);
11326 lov.add_rdlock(&diri->dirfragtreelock);
11327
11328 if (!mds->locker->acquire_locks(mdr, lov))
11329 return;
11330
11331 if (!check_access(mdr, diri, MAY_READ))
11332 return;
11333
11334 // which frag?
11335 frag_t fg = (__u32)req->head.args.snapdiff.frag;
11336 unsigned req_flags = (__u32)req->head.args.snapdiff.flags;
11337 string offset_str = req->get_path2();
11338
11339 __u32 offset_hash = 0;
11340 if (!offset_str.empty()) {
11341 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
11342 } else {
11343 offset_hash = (__u32)req->head.args.snapdiff.offset_hash;
11344 }
11345
11346 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
11347 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
11348
11349 // does the frag exist?
11350 if (diri->dirfragtree[fg.value()] != fg) {
11351 frag_t newfg;
11352 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
11353 if (fg.contains((unsigned)offset_hash)) {
11354 newfg = diri->dirfragtree[offset_hash];
11355 } else {
11356 // client actually wants next frag
11357 newfg = diri->dirfragtree[fg.value()];
11358 }
11359 } else {
11360 offset_str.clear();
11361 newfg = diri->dirfragtree[fg.value()];
11362 }
11363 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
11364 fg = newfg;
11365 }
11366
11367 CDir* dir = try_open_auth_dirfrag(diri, fg, mdr);
11368 if (!dir) return;
11369
11370 // ok!
11371 dout(10) << __func__<< " on " << *dir << dendl;
11372 ceph_assert(dir->is_auth());
11373
11374 if (!dir->is_complete()) {
11375 if (dir->is_frozen()) {
11376 dout(7) << "dir is frozen " << *dir << dendl;
11377 mds->locker->drop_locks(mdr.get());
11378 mdr->drop_local_auth_pins();
11379 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
11380 return;
11381 }
11382 // fetch
11383 dout(10) << " incomplete dir contents for snapdiff on " << *dir << ", fetching" << dendl;
11384 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
11385 return;
11386 }
11387
11388 #ifdef MDS_VERIFY_FRAGSTAT
11389 dir->verify_fragstat();
11390 #endif
11391
11392 utime_t now = ceph_clock_now();
11393 mdr->set_mds_stamp(now);
11394
11395 mdr->snapid_diff_other = (uint64_t)req->head.args.snapdiff.snap_other;
11396 if (mdr->snapid_diff_other == mdr->snapid ||
11397 mdr->snapid == CEPH_NOSNAP ||
11398 mdr->snapid_diff_other == CEPH_NOSNAP) {
11399 dout(10) << "reply to " << *req << " snapdiff -CEPHFS_EINVAL" << dendl;
11400 respond_to_request(mdr, -CEPHFS_EINVAL);
11401 }
11402
11403 dout(10) << __func__
11404 << " snap " << mdr->snapid
11405 << " vs. snap " << mdr->snapid_diff_other
11406 << dendl;
11407
11408 SnapRealm* realm = diri->find_snaprealm();
11409
11410 unsigned max = req->head.args.snapdiff.max_entries;
11411 if (!max)
11412 max = dir->get_num_any(); // whatever, something big.
11413 unsigned max_bytes = req->head.args.snapdiff.max_bytes;
11414 if (!max_bytes)
11415 // make sure at least one item can be encoded
11416 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
11417
11418 // start final blob
11419 bufferlist dirbl;
11420 DirStat ds;
11421 ds.frag = dir->get_frag();
11422 ds.auth = dir->get_dir_auth().first;
11423 if (dir->is_auth() && !forward_all_requests_to_auth)
11424 dir->get_dist_spec(ds.dist, mds->get_nodeid());
11425
11426 dir->encode_dirstat(dirbl, mdr->session->info, ds);
11427
11428 // count bytes available.
11429 // this isn't perfect, but we should capture the main variable/unbounded size items!
11430 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8) * 2;
11431 int bytes_left = max_bytes - front_bytes;
11432 bytes_left -= get_snap_trace(session, realm).length();
11433
11434 _readdir_diff(
11435 now,
11436 mdr,
11437 diri,
11438 dir,
11439 realm,
11440 max,
11441 bytes_left,
11442 offset_str,
11443 offset_hash,
11444 req_flags,
11445 dirbl);
11446 }
11447
11448
11449 /**
11450 * Return true if server is in state RECONNECT and this
11451 * client has not yet reconnected.
11452 */
11453 bool Server::waiting_for_reconnect(client_t c) const
11454 {
11455 return client_reconnect_gather.count(c) > 0;
11456 }
11457
11458 void Server::dump_reconnect_status(Formatter *f) const
11459 {
11460 f->open_object_section("reconnect_status");
11461 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
11462 f->close_section();
11463 }
11464
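/**
 * Return the snap trace in the encoding this session understands: clients
 * with the NEW_SNAPREALM_INFO feature get the new-format trace, older
 * clients the legacy one.
 */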
11465 const bufferlist& Server::get_snap_trace(Session *session, SnapRealm *realm) const {
11466 ceph_assert(session);
11467 ceph_assert(realm);
11468 if (session->info.has_feature(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) {
11469 return realm->get_snap_trace_new();
11470 } else {
11471 return realm->get_snap_trace();
11472 }
11473 }
11474
11475 const bufferlist& Server::get_snap_trace(client_t client, SnapRealm *realm) const {
11476 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
11477 return get_snap_trace(session, realm);
11478 }
11479
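/**
 * Build the reply payload for readdir_snapdiff: walk the dirfrag via
 * build_snap_diff(), encode a dentry name, lease and inodestat for every
 * entry added, removed or modified between the two snapshots, then hand the
 * result to _finalize_readdir().
 */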
11480 void Server::_readdir_diff(
11481 utime_t now,
11482 MDRequestRef& mdr,
11483 CInode* diri,
11484 CDir* dir,
11485 SnapRealm* realm,
11486 unsigned max_entries,
11487 int bytes_left,
11488 const string& offset_str,
11489 uint32_t offset_hash,
11490 unsigned req_flags,
11491 bufferlist& dirbl)
11492 {
11493 // build dir contents
11494 bufferlist dnbl;
11495 __u32 numfiles = 0;
11496
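// normalize the snapshot pair so that snapid_prev <= snapid; the diff is
// computed over the closed interval [snapid_prev, snapid]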
11497 snapid_t snapid = mdr->snapid;
11498 snapid_t snapid_prev = mdr->snapid_diff_other;
11499 if (snapid < snapid_prev) {
11500 std::swap(snapid, snapid_prev);
11501 }
11502 bool from_the_beginning = !offset_hash && offset_str.empty();
11503 // skip all dns < dentry_key_t(snapid_prev, offset_str, offset_hash)
11504 dentry_key_t skip_key(snapid_prev, offset_str.c_str(), offset_hash);
11505
11506 bool end = build_snap_diff(
11507 mdr,
11508 dir,
11509 bytes_left,
11510 from_the_beginning ? nullptr : &skip_key,
11511 snapid_prev,
11512 snapid,
11513 dnbl,
11514 [&](CDentry* dn, CInode* in, bool exists) {
11515 string name;
11516 snapid_t effective_snapid;
11517 const auto& dn_name = dn->get_name();
11518 // encode removed entries as of the older snapid (snapid_prev) and
11519 // existing ones as of the newer snapid
11520 effective_snapid = exists ? snapid : snapid_prev;
11521 name.append(dn_name);
11522 if ((int)(dnbl.length() + name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
11523 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
11524 return false;
11525 }
11526
11527 auto diri = dir->get_inode();
11528 auto hash = ceph_frag_value(diri->hash_dentry_name(dn_name));
11529 unsigned start_len = dnbl.length();
11530 dout(10) << "inc dn " << *dn << " as " << name
11531 << std::hex << " hash 0x" << hash << std::dec
11532 << dendl;
11533 encode(name, dnbl);
11534 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
11535
11536 // inode
11537 dout(10) << "inc inode " << *in << " snap " << effective_snapid << dendl;
11538 int r = in->encode_inodestat(dnbl, mdr->session, realm, effective_snapid, bytes_left - (int)dnbl.length());
11539 if (r < 0) {
11540 // chop off dn->name, lease
11541 dout(10) << " ran out of room, stopping at "
11542 << start_len << " < " << bytes_left << dendl;
11543 bufferlist keep;
11544 keep.substr_of(dnbl, 0, start_len);
11545 dnbl.swap(keep);
11546 return false;
11547 }
11548
11549 // touch dn
11550 mdcache->lru.lru_touch(dn);
11551 ++numfiles;
11552 return true;
11553 });
11554
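// tell bitflags-aware clients that entries are in hash order and that the
// readdir offset they hand back is a dentry-name hash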
11555 __u16 flags = 0;
11556 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
11557 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
11558 }
11559
11560 std::swap(mdr->snapid, mdr->snapid_diff_other); // we want opponent snapid to be used for tracei
11561
11562 _finalize_readdir(mdr, diri, dir, from_the_beginning, end, flags, numfiles,
11563 dirbl, dnbl);
11564 }
11565
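/**
 * Scan a dirfrag and report the dentries that differ between snapid_prev and
 * snapid through add_result_cb(dn, in, exists): exists==false marks entries
 * present only in the older snapshot (deleted), exists==true marks entries
 * present in the newer one (new, or modified as detected by an mtime change).
 * Returns true when the whole fragment was scanned, false if the callback ran
 * out of space or a remote dentry still needs to be opened.
 */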
11566 bool Server::build_snap_diff(
11567 MDRequestRef& mdr,
11568 CDir* dir,
11569 int bytes_left,
11570 dentry_key_t* skip_key,
11571 snapid_t snapid_prev,
11572 snapid_t snapid,
11573 const bufferlist& dnbl,
11574 std::function<bool (CDentry*, CInode*, bool)> add_result_cb)
11575 {
11576 client_t client = mdr->client_request->get_source().num();
11577
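// "before" caches the most recently seen dentry whose validity range covers
// the older snapshot but not the newer one; whether it represents a deletion
// or just an older revision of a still-existing name is decided when the next
// dentry (same name, newer range) is examined, or at the end of the scan.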
11578 struct EntryInfo {
11579 CDentry* dn = nullptr;
11580 CInode* in = nullptr;
11581 utime_t mtime;
11582
11583 void reset() {
11584 *this = EntryInfo();
11585 }
11586 } before;
11587
11588 auto insert_deleted = [&](EntryInfo& ei) {
11589 dout(20) << "build_snap_diff deleted file " << ei.dn->get_name() << " "
11590 << ei.dn->first << "/" << ei.dn->last << dendl;
11591 int r = add_result_cb(ei.dn, ei.in, false);
11592 ei.reset();
11593 return r;
11594 };
11595
11596 auto it = !skip_key ? dir->begin() : dir->lower_bound(*skip_key);
11597
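// walk the fragment's dentries; lower_bound() above resumes the scan at the
// client-supplied offset key, and the skip_key check below drops anything
// not strictly past it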
11598 while (it != dir->end()) {
11599 CDentry* dn = it->second;
11600 dout(20) << __func__ << " " << it->first << "->" << *dn << dendl;
11601 ++it;
11602 if (dn->state_test(CDentry::STATE_PURGING))
11603 continue;
11604
11605 bool dnp = dn->use_projected(client, mdr);
11606 CDentry::linkage_t* dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
11607
11608 if (dnl->is_null()) {
11609 dout(20) << __func__ << " linkage is null, skipping" << dendl;
11610 continue;
11611 }
11612
11613 if (dn->last < snapid_prev || dn->first > snapid) {
11614 dout(20) << __func__ << " not in range, skipping" << dendl;
11615 continue;
11616 }
11617 if (skip_key) {
11618 skip_key->snapid = dn->last;
11619 if (!(*skip_key < dn->key()))
11620 continue;
11621 }
11622
11623 CInode* in = dnl->get_inode();
11624 if (in && in->ino() == CEPH_INO_CEPH)
11625 continue;
11626
11627 // remote link?
11628 // better for the MDS to do the work, if we think the client will stat any of these files.
11629 if (dnl->is_remote() && !in) {
11630 in = mdcache->get_inode(dnl->get_remote_ino());
11631 dout(20) << __func__ << " remote in: " << *in << " ino " << std::hex << dnl->get_remote_ino() << std::dec << dendl;
11632 if (in) {
11633 dn->link_remote(dnl, in);
11634 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
11635 dout(10) << "skipping bad remote ino on " << *dn << dendl;
11636 continue;
11637 } else {
11638 // touch everything i _do_ have
11639 for (auto& p : *dir) {
11640 if (!p.second->get_linkage()->is_null())
11641 mdcache->lru.lru_touch(p.second);
11642 }
11643
11644 // already issued caps and leases, reply immediately.
11645 if (dnbl.length() > 0) {
11646 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
11647 dout(10) << " open remote dentry after caps were issued, stopping at "
11648 << dnbl.length() << " < " << bytes_left << dendl;
11649 } else {
11650 mds->locker->drop_locks(mdr.get());
11651 mdr->drop_local_auth_pins();
11652 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
11653 }
11654 return false;
11655 }
11656 }
11657 ceph_assert(in);
11658
11659 utime_t mtime = in->get_inode()->mtime;
11660
11661 if (in->is_dir()) {
11662
11663 // entries must be emitted in name-hash order, so flush any pending
11664 // "before" entry right away before reporting this directory
11665 if (before.dn) {
11666 if (!insert_deleted(before)) {
11667 break;
11668 }
11669 }
11670
11671 bool exists = true;
11672 if (snapid_prev < dn->first && dn->last < snapid) {
11673 dout(20) << __func__ << " skipping inner " << dn->get_name() << " "
11674 << dn->first << "/" << dn->last << dendl;
11675 continue;
11676 } else if (dn->first <= snapid_prev && dn->last < snapid) {
11677 // dir deleted
11678 dout(20) << __func__ << " deleted dir " << dn->get_name() << " "
11679 << dn->first << "/" << dn->last << dendl;
11680 exists = false;
11681 }
11682 bool r = add_result_cb(dn, in, exists);
11683 if (!r) {
11684 break;
11685 }
11686 } else {
11687 if (snapid_prev >= dn->first && snapid <= dn->last) {
11688 dout(20) << __func__ << " skipping unchanged " << dn->get_name() << " "
11689 << dn->first << "/" << dn->last << dendl;
11690 continue;
11691 } else if (snapid_prev < dn->first && snapid > dn->last) {
11692 dout(20) << __func__ << " skipping inner modification " << dn->get_name() << " "
11693 << dn->first << "/" << dn->last << dendl;
11694 continue;
11695 }
11696 string_view name_before =
11697 before.dn ? string_view(before.dn->get_name()) : string_view();
11698 if (before.dn && dn->get_name() != name_before) {
11699 if (!insert_deleted(before)) {
11700 break;
11701 }
11702 before.reset();
11703 }
11704 if (snapid_prev >= dn->first && snapid_prev <= dn->last) {
11705 dout(30) << __func__ << " dn_before " << dn->get_name() << " "
11706 << dn->first << "/" << dn->last << dendl;
11707 before = EntryInfo {dn, in, mtime};
11708 continue;
11709 } else {
11710 if (before.dn && dn->get_name() == name_before) {
11711 if (mtime == before.mtime) {
11712 dout(30) << __func__ << " timestamp not changed " << dn->get_name() << " "
11713 << dn->first << "/" << dn->last
11714 << " " << mtime
11715 << dendl;
11716 before.reset();
11717 continue;
11718 } else {
11719 dout(30) << __func__ << " timestamp changed " << dn->get_name() << " "
11720 << dn->first << "/" << dn->last
11721 << " " << before.mtime << " vs. " << mtime
11722 << dendl;
11723 before.reset();
11724 }
11725 }
11726 dout(20) << __func__ << " new file " << dn->get_name() << " "
11727 << dn->first << "/" << dn->last
11728 << dendl;
11729 ceph_assert(snapid >= dn->first && snapid <= dn->last);
11730 }
11731 if (!add_result_cb(dn, in, true)) {
11732 break;
11733 }
11734 }
11735 }
11736 if (before.dn) {
11737 insert_deleted(before);
11738 }
11739 return it == dir->end();
11740 }